polars-runtime-compat 1.34.0b2__cp39-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic. Click here for more details.
- _polars_runtime_compat/.gitkeep +0 -0
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- polars/__init__.py +528 -0
- polars/_cpu_check.py +265 -0
- polars/_dependencies.py +355 -0
- polars/_plr.py +99 -0
- polars/_plr.pyi +2496 -0
- polars/_reexport.py +23 -0
- polars/_typing.py +478 -0
- polars/_utils/__init__.py +37 -0
- polars/_utils/async_.py +102 -0
- polars/_utils/cache.py +176 -0
- polars/_utils/cloud.py +40 -0
- polars/_utils/constants.py +29 -0
- polars/_utils/construction/__init__.py +46 -0
- polars/_utils/construction/dataframe.py +1397 -0
- polars/_utils/construction/other.py +72 -0
- polars/_utils/construction/series.py +560 -0
- polars/_utils/construction/utils.py +118 -0
- polars/_utils/convert.py +224 -0
- polars/_utils/deprecation.py +406 -0
- polars/_utils/getitem.py +457 -0
- polars/_utils/logging.py +11 -0
- polars/_utils/nest_asyncio.py +264 -0
- polars/_utils/parquet.py +15 -0
- polars/_utils/parse/__init__.py +12 -0
- polars/_utils/parse/expr.py +242 -0
- polars/_utils/polars_version.py +19 -0
- polars/_utils/pycapsule.py +53 -0
- polars/_utils/scan.py +27 -0
- polars/_utils/serde.py +63 -0
- polars/_utils/slice.py +215 -0
- polars/_utils/udfs.py +1251 -0
- polars/_utils/unstable.py +63 -0
- polars/_utils/various.py +782 -0
- polars/_utils/wrap.py +25 -0
- polars/api.py +370 -0
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +19 -0
- polars/catalog/unity/client.py +733 -0
- polars/catalog/unity/models.py +152 -0
- polars/config.py +1571 -0
- polars/convert/__init__.py +25 -0
- polars/convert/general.py +1046 -0
- polars/convert/normalize.py +261 -0
- polars/dataframe/__init__.py +5 -0
- polars/dataframe/_html.py +186 -0
- polars/dataframe/frame.py +12582 -0
- polars/dataframe/group_by.py +1067 -0
- polars/dataframe/plotting.py +257 -0
- polars/datatype_expr/__init__.py +5 -0
- polars/datatype_expr/array.py +56 -0
- polars/datatype_expr/datatype_expr.py +304 -0
- polars/datatype_expr/list.py +18 -0
- polars/datatype_expr/struct.py +69 -0
- polars/datatypes/__init__.py +122 -0
- polars/datatypes/_parse.py +195 -0
- polars/datatypes/_utils.py +48 -0
- polars/datatypes/classes.py +1213 -0
- polars/datatypes/constants.py +11 -0
- polars/datatypes/constructor.py +172 -0
- polars/datatypes/convert.py +366 -0
- polars/datatypes/group.py +130 -0
- polars/exceptions.py +230 -0
- polars/expr/__init__.py +7 -0
- polars/expr/array.py +964 -0
- polars/expr/binary.py +346 -0
- polars/expr/categorical.py +306 -0
- polars/expr/datetime.py +2620 -0
- polars/expr/expr.py +11272 -0
- polars/expr/list.py +1408 -0
- polars/expr/meta.py +444 -0
- polars/expr/name.py +321 -0
- polars/expr/string.py +3045 -0
- polars/expr/struct.py +357 -0
- polars/expr/whenthen.py +185 -0
- polars/functions/__init__.py +193 -0
- polars/functions/aggregation/__init__.py +33 -0
- polars/functions/aggregation/horizontal.py +298 -0
- polars/functions/aggregation/vertical.py +341 -0
- polars/functions/as_datatype.py +848 -0
- polars/functions/business.py +138 -0
- polars/functions/col.py +384 -0
- polars/functions/datatype.py +121 -0
- polars/functions/eager.py +524 -0
- polars/functions/escape_regex.py +29 -0
- polars/functions/lazy.py +2751 -0
- polars/functions/len.py +68 -0
- polars/functions/lit.py +210 -0
- polars/functions/random.py +22 -0
- polars/functions/range/__init__.py +19 -0
- polars/functions/range/_utils.py +15 -0
- polars/functions/range/date_range.py +303 -0
- polars/functions/range/datetime_range.py +370 -0
- polars/functions/range/int_range.py +348 -0
- polars/functions/range/linear_space.py +311 -0
- polars/functions/range/time_range.py +287 -0
- polars/functions/repeat.py +301 -0
- polars/functions/whenthen.py +353 -0
- polars/interchange/__init__.py +10 -0
- polars/interchange/buffer.py +77 -0
- polars/interchange/column.py +190 -0
- polars/interchange/dataframe.py +230 -0
- polars/interchange/from_dataframe.py +328 -0
- polars/interchange/protocol.py +303 -0
- polars/interchange/utils.py +170 -0
- polars/io/__init__.py +64 -0
- polars/io/_utils.py +317 -0
- polars/io/avro.py +49 -0
- polars/io/clipboard.py +36 -0
- polars/io/cloud/__init__.py +17 -0
- polars/io/cloud/_utils.py +80 -0
- polars/io/cloud/credential_provider/__init__.py +17 -0
- polars/io/cloud/credential_provider/_builder.py +520 -0
- polars/io/cloud/credential_provider/_providers.py +618 -0
- polars/io/csv/__init__.py +9 -0
- polars/io/csv/_utils.py +38 -0
- polars/io/csv/batched_reader.py +142 -0
- polars/io/csv/functions.py +1495 -0
- polars/io/database/__init__.py +6 -0
- polars/io/database/_arrow_registry.py +70 -0
- polars/io/database/_cursor_proxies.py +147 -0
- polars/io/database/_executor.py +578 -0
- polars/io/database/_inference.py +314 -0
- polars/io/database/_utils.py +144 -0
- polars/io/database/functions.py +516 -0
- polars/io/delta.py +499 -0
- polars/io/iceberg/__init__.py +3 -0
- polars/io/iceberg/_utils.py +697 -0
- polars/io/iceberg/dataset.py +556 -0
- polars/io/iceberg/functions.py +151 -0
- polars/io/ipc/__init__.py +8 -0
- polars/io/ipc/functions.py +514 -0
- polars/io/json/__init__.py +3 -0
- polars/io/json/read.py +101 -0
- polars/io/ndjson.py +332 -0
- polars/io/parquet/__init__.py +17 -0
- polars/io/parquet/field_overwrites.py +140 -0
- polars/io/parquet/functions.py +722 -0
- polars/io/partition.py +491 -0
- polars/io/plugins.py +187 -0
- polars/io/pyarrow_dataset/__init__.py +5 -0
- polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
- polars/io/pyarrow_dataset/functions.py +79 -0
- polars/io/scan_options/__init__.py +5 -0
- polars/io/scan_options/_options.py +59 -0
- polars/io/scan_options/cast_options.py +126 -0
- polars/io/spreadsheet/__init__.py +6 -0
- polars/io/spreadsheet/_utils.py +52 -0
- polars/io/spreadsheet/_write_utils.py +647 -0
- polars/io/spreadsheet/functions.py +1323 -0
- polars/lazyframe/__init__.py +9 -0
- polars/lazyframe/engine_config.py +61 -0
- polars/lazyframe/frame.py +8564 -0
- polars/lazyframe/group_by.py +669 -0
- polars/lazyframe/in_process.py +42 -0
- polars/lazyframe/opt_flags.py +333 -0
- polars/meta/__init__.py +14 -0
- polars/meta/build.py +33 -0
- polars/meta/index_type.py +27 -0
- polars/meta/thread_pool.py +50 -0
- polars/meta/versions.py +120 -0
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +213 -0
- polars/ml/utilities.py +30 -0
- polars/plugins.py +155 -0
- polars/py.typed +0 -0
- polars/pyproject.toml +96 -0
- polars/schema.py +265 -0
- polars/selectors.py +3117 -0
- polars/series/__init__.py +5 -0
- polars/series/array.py +776 -0
- polars/series/binary.py +254 -0
- polars/series/categorical.py +246 -0
- polars/series/datetime.py +2275 -0
- polars/series/list.py +1087 -0
- polars/series/plotting.py +191 -0
- polars/series/series.py +9197 -0
- polars/series/string.py +2367 -0
- polars/series/struct.py +154 -0
- polars/series/utils.py +191 -0
- polars/sql/__init__.py +7 -0
- polars/sql/context.py +677 -0
- polars/sql/functions.py +139 -0
- polars/string_cache.py +185 -0
- polars/testing/__init__.py +13 -0
- polars/testing/asserts/__init__.py +9 -0
- polars/testing/asserts/frame.py +231 -0
- polars/testing/asserts/series.py +219 -0
- polars/testing/asserts/utils.py +12 -0
- polars/testing/parametric/__init__.py +33 -0
- polars/testing/parametric/profiles.py +107 -0
- polars/testing/parametric/strategies/__init__.py +22 -0
- polars/testing/parametric/strategies/_utils.py +14 -0
- polars/testing/parametric/strategies/core.py +615 -0
- polars/testing/parametric/strategies/data.py +452 -0
- polars/testing/parametric/strategies/dtype.py +436 -0
- polars/testing/parametric/strategies/legacy.py +169 -0
- polars/type_aliases.py +24 -0
- polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
- polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
- polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
polars/functions/lazy.py
ADDED
|
@@ -0,0 +1,2751 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, overload
|
|
5
|
+
|
|
6
|
+
import polars._reexport as pl
|
|
7
|
+
import polars.functions as F
|
|
8
|
+
import polars.selectors as cs
|
|
9
|
+
from polars._dependencies import _check_for_numpy
|
|
10
|
+
from polars._dependencies import numpy as np
|
|
11
|
+
from polars._utils.async_ import _AioDataFrameResult, _GeventDataFrameResult
|
|
12
|
+
from polars._utils.deprecation import (
|
|
13
|
+
deprecate_renamed_parameter,
|
|
14
|
+
deprecate_streaming_parameter,
|
|
15
|
+
deprecated,
|
|
16
|
+
issue_deprecation_warning,
|
|
17
|
+
)
|
|
18
|
+
from polars._utils.parse import (
|
|
19
|
+
parse_into_expression,
|
|
20
|
+
parse_into_list_of_expressions,
|
|
21
|
+
)
|
|
22
|
+
from polars._utils.unstable import issue_unstable_warning, unstable
|
|
23
|
+
from polars._utils.various import extend_bool, qualified_type_name
|
|
24
|
+
from polars._utils.wrap import wrap_df, wrap_expr, wrap_s
|
|
25
|
+
from polars.datatypes import DTYPE_TEMPORAL_UNITS, Date, Datetime, Int64
|
|
26
|
+
from polars.datatypes._parse import parse_into_datatype_expr
|
|
27
|
+
from polars.lazyframe.opt_flags import (
|
|
28
|
+
DEFAULT_QUERY_OPT_FLAGS,
|
|
29
|
+
forward_old_opt_flags,
|
|
30
|
+
)
|
|
31
|
+
from polars.meta.index_type import get_index_type
|
|
32
|
+
|
|
33
|
+
with contextlib.suppress(ImportError): # Module not available when building docs
|
|
34
|
+
import polars._plr as plr
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
import sys
|
|
38
|
+
from collections.abc import Awaitable, Collection, Iterable, Sequence
|
|
39
|
+
from typing import Literal
|
|
40
|
+
|
|
41
|
+
from polars import DataFrame, Expr, LazyFrame, Series
|
|
42
|
+
from polars._typing import (
|
|
43
|
+
CorrelationMethod,
|
|
44
|
+
EngineType,
|
|
45
|
+
EpochTimeUnit,
|
|
46
|
+
IntoExpr,
|
|
47
|
+
PolarsDataType,
|
|
48
|
+
QuantileMethod,
|
|
49
|
+
)
|
|
50
|
+
from polars.lazyframe.opt_flags import (
|
|
51
|
+
QueryOptFlags,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
if sys.version_info >= (3, 13):
|
|
55
|
+
from warnings import deprecated
|
|
56
|
+
else:
|
|
57
|
+
from typing_extensions import deprecated # noqa: TC004
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def field(name: str | list[str]) -> Expr:
|
|
61
|
+
"""
|
|
62
|
+
Select a field in the current `struct.with_fields` scope.
|
|
63
|
+
|
|
64
|
+
name
|
|
65
|
+
Name of the field(s) to select.
|
|
66
|
+
"""
|
|
67
|
+
if isinstance(name, str):
|
|
68
|
+
name = [name]
|
|
69
|
+
return wrap_expr(plr.field(name))
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def element() -> Expr:
|
|
73
|
+
"""
|
|
74
|
+
Alias for an element being evaluated in an `eval` or `filter` expression.
|
|
75
|
+
|
|
76
|
+
Examples
|
|
77
|
+
--------
|
|
78
|
+
A horizontal rank computation by taking the elements of a list
|
|
79
|
+
|
|
80
|
+
>>> df = pl.DataFrame(
|
|
81
|
+
... {
|
|
82
|
+
... "a": [1, 8, 3],
|
|
83
|
+
... "b": [4, 5, 2],
|
|
84
|
+
... }
|
|
85
|
+
... )
|
|
86
|
+
>>> df.with_columns(
|
|
87
|
+
... pl.concat_list(["a", "b"]).list.eval(pl.element().rank()).alias("rank")
|
|
88
|
+
... )
|
|
89
|
+
shape: (3, 3)
|
|
90
|
+
┌─────┬─────┬────────────┐
|
|
91
|
+
│ a ┆ b ┆ rank │
|
|
92
|
+
│ --- ┆ --- ┆ --- │
|
|
93
|
+
│ i64 ┆ i64 ┆ list[f64] │
|
|
94
|
+
╞═════╪═════╪════════════╡
|
|
95
|
+
│ 1 ┆ 4 ┆ [1.0, 2.0] │
|
|
96
|
+
│ 8 ┆ 5 ┆ [2.0, 1.0] │
|
|
97
|
+
│ 3 ┆ 2 ┆ [2.0, 1.0] │
|
|
98
|
+
└─────┴─────┴────────────┘
|
|
99
|
+
|
|
100
|
+
A mathematical operation on array elements
|
|
101
|
+
|
|
102
|
+
>>> df = pl.DataFrame(
|
|
103
|
+
... {
|
|
104
|
+
... "a": [1, 8, 3],
|
|
105
|
+
... "b": [4, 5, 2],
|
|
106
|
+
... }
|
|
107
|
+
... )
|
|
108
|
+
>>> df.with_columns(
|
|
109
|
+
... pl.concat_list(["a", "b"]).list.eval(pl.element() * 2).alias("a_b_doubled")
|
|
110
|
+
... )
|
|
111
|
+
shape: (3, 3)
|
|
112
|
+
┌─────┬─────┬─────────────┐
|
|
113
|
+
│ a ┆ b ┆ a_b_doubled │
|
|
114
|
+
│ --- ┆ --- ┆ --- │
|
|
115
|
+
│ i64 ┆ i64 ┆ list[i64] │
|
|
116
|
+
╞═════╪═════╪═════════════╡
|
|
117
|
+
│ 1 ┆ 4 ┆ [2, 8] │
|
|
118
|
+
│ 8 ┆ 5 ┆ [16, 10] │
|
|
119
|
+
│ 3 ┆ 2 ┆ [6, 4] │
|
|
120
|
+
└─────┴─────┴─────────────┘
|
|
121
|
+
|
|
122
|
+
A filter operation on list elements
|
|
123
|
+
|
|
124
|
+
>>> import polars as pl
|
|
125
|
+
>>> df = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]})
|
|
126
|
+
>>> df.with_columns(
|
|
127
|
+
... evens=pl.concat_list("a", "b").list.filter(pl.element() % 2 == 0)
|
|
128
|
+
... )
|
|
129
|
+
shape: (3, 3)
|
|
130
|
+
┌─────┬─────┬───────────┐
|
|
131
|
+
│ a ┆ b ┆ evens │
|
|
132
|
+
│ --- ┆ --- ┆ --- │
|
|
133
|
+
│ i64 ┆ i64 ┆ list[i64] │
|
|
134
|
+
╞═════╪═════╪═══════════╡
|
|
135
|
+
│ 1 ┆ 4 ┆ [4] │
|
|
136
|
+
│ 8 ┆ 5 ┆ [8] │
|
|
137
|
+
│ 3 ┆ 2 ┆ [2] │
|
|
138
|
+
└─────┴─────┴───────────┘
|
|
139
|
+
"""
|
|
140
|
+
return F.col("")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def count(*columns: str) -> Expr:
|
|
144
|
+
"""
|
|
145
|
+
Return the number of non-null values in the column.
|
|
146
|
+
|
|
147
|
+
This function is syntactic sugar for `col(columns).count()`.
|
|
148
|
+
|
|
149
|
+
Calling this function without any arguments returns the number of rows in the
|
|
150
|
+
context. **This way of using the function is deprecated.** Please use :func:`len`
|
|
151
|
+
instead.
|
|
152
|
+
|
|
153
|
+
Parameters
|
|
154
|
+
----------
|
|
155
|
+
*columns
|
|
156
|
+
One or more column names.
|
|
157
|
+
|
|
158
|
+
Returns
|
|
159
|
+
-------
|
|
160
|
+
Expr
|
|
161
|
+
Expression of data type :class:`UInt32`.
|
|
162
|
+
|
|
163
|
+
See Also
|
|
164
|
+
--------
|
|
165
|
+
Expr.count
|
|
166
|
+
|
|
167
|
+
Examples
|
|
168
|
+
--------
|
|
169
|
+
>>> df = pl.DataFrame(
|
|
170
|
+
... {
|
|
171
|
+
... "a": [1, 2, None],
|
|
172
|
+
... "b": [3, None, None],
|
|
173
|
+
... "c": ["foo", "bar", "foo"],
|
|
174
|
+
... }
|
|
175
|
+
... )
|
|
176
|
+
>>> df.select(pl.count("a"))
|
|
177
|
+
shape: (1, 1)
|
|
178
|
+
┌─────┐
|
|
179
|
+
│ a │
|
|
180
|
+
│ --- │
|
|
181
|
+
│ u32 │
|
|
182
|
+
╞═════╡
|
|
183
|
+
│ 2 │
|
|
184
|
+
└─────┘
|
|
185
|
+
|
|
186
|
+
Return the number of non-null values in multiple columns.
|
|
187
|
+
|
|
188
|
+
>>> df.select(pl.count("b", "c"))
|
|
189
|
+
shape: (1, 2)
|
|
190
|
+
┌─────┬─────┐
|
|
191
|
+
│ b ┆ c │
|
|
192
|
+
│ --- ┆ --- │
|
|
193
|
+
│ u32 ┆ u32 │
|
|
194
|
+
╞═════╪═════╡
|
|
195
|
+
│ 1 ┆ 3 │
|
|
196
|
+
└─────┴─────┘
|
|
197
|
+
|
|
198
|
+
Return the number of rows in a context. **This way of using the function is
|
|
199
|
+
deprecated.** Please use :func:`len` instead.
|
|
200
|
+
|
|
201
|
+
>>> df.select(pl.count()) # doctest: +SKIP
|
|
202
|
+
shape: (1, 1)
|
|
203
|
+
┌───────┐
|
|
204
|
+
│ count │
|
|
205
|
+
│ --- │
|
|
206
|
+
│ u32 │
|
|
207
|
+
╞═══════╡
|
|
208
|
+
│ 3 │
|
|
209
|
+
└───────┘
|
|
210
|
+
"""
|
|
211
|
+
if not columns:
|
|
212
|
+
issue_deprecation_warning(
|
|
213
|
+
"`pl.count()` is deprecated. Please use `pl.len()` instead.",
|
|
214
|
+
version="0.20.5",
|
|
215
|
+
)
|
|
216
|
+
return F.len().alias("count")
|
|
217
|
+
return F.col(*columns).count()
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def cum_count(*columns: str, reverse: bool = False) -> Expr:
|
|
221
|
+
"""
|
|
222
|
+
Return the cumulative count of the non-null values in the column.
|
|
223
|
+
|
|
224
|
+
This function is syntactic sugar for `col(columns).cum_count()`.
|
|
225
|
+
|
|
226
|
+
Parameters
|
|
227
|
+
----------
|
|
228
|
+
*columns
|
|
229
|
+
Name(s) of the columns to use.
|
|
230
|
+
reverse
|
|
231
|
+
Reverse the operation.
|
|
232
|
+
|
|
233
|
+
Examples
|
|
234
|
+
--------
|
|
235
|
+
>>> df = pl.DataFrame({"a": [1, 2, None], "b": [3, None, None]})
|
|
236
|
+
>>> df.with_columns(
|
|
237
|
+
... ca=pl.cum_count("a"),
|
|
238
|
+
... cb=pl.cum_count("b"),
|
|
239
|
+
... )
|
|
240
|
+
shape: (3, 4)
|
|
241
|
+
┌──────┬──────┬─────┬─────┐
|
|
242
|
+
│ a ┆ b ┆ ca ┆ cb │
|
|
243
|
+
│ --- ┆ --- ┆ --- ┆ --- │
|
|
244
|
+
│ i64 ┆ i64 ┆ u32 ┆ u32 │
|
|
245
|
+
╞══════╪══════╪═════╪═════╡
|
|
246
|
+
│ 1 ┆ 3 ┆ 1 ┆ 1 │
|
|
247
|
+
│ 2 ┆ null ┆ 2 ┆ 1 │
|
|
248
|
+
│ null ┆ null ┆ 2 ┆ 1 │
|
|
249
|
+
└──────┴──────┴─────┴─────┘
|
|
250
|
+
"""
|
|
251
|
+
return F.col(*columns).cum_count(reverse=reverse)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def implode(*columns: str) -> Expr:
|
|
255
|
+
"""
|
|
256
|
+
Aggregate all column values into a list.
|
|
257
|
+
|
|
258
|
+
This function is syntactic sugar for `pl.col(name).implode()`.
|
|
259
|
+
|
|
260
|
+
Parameters
|
|
261
|
+
----------
|
|
262
|
+
*columns
|
|
263
|
+
One or more column names.
|
|
264
|
+
|
|
265
|
+
Examples
|
|
266
|
+
--------
|
|
267
|
+
>>> df = pl.DataFrame(
|
|
268
|
+
... {
|
|
269
|
+
... "a": [1, 2, 3],
|
|
270
|
+
... "b": [9, 8, 7],
|
|
271
|
+
... "c": ["foo", "bar", "foo"],
|
|
272
|
+
... }
|
|
273
|
+
... )
|
|
274
|
+
>>> df.select(pl.implode("a"))
|
|
275
|
+
shape: (1, 1)
|
|
276
|
+
┌───────────┐
|
|
277
|
+
│ a │
|
|
278
|
+
│ --- │
|
|
279
|
+
│ list[i64] │
|
|
280
|
+
╞═══════════╡
|
|
281
|
+
│ [1, 2, 3] │
|
|
282
|
+
└───────────┘
|
|
283
|
+
>>> df.select(pl.implode("b", "c"))
|
|
284
|
+
shape: (1, 2)
|
|
285
|
+
┌───────────┬───────────────────────┐
|
|
286
|
+
│ b ┆ c │
|
|
287
|
+
│ --- ┆ --- │
|
|
288
|
+
│ list[i64] ┆ list[str] │
|
|
289
|
+
╞═══════════╪═══════════════════════╡
|
|
290
|
+
│ [9, 8, 7] ┆ ["foo", "bar", "foo"] │
|
|
291
|
+
└───────────┴───────────────────────┘
|
|
292
|
+
|
|
293
|
+
"""
|
|
294
|
+
return F.col(*columns).implode()
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def std(column: str, ddof: int = 1) -> Expr:
|
|
298
|
+
"""
|
|
299
|
+
Get the standard deviation.
|
|
300
|
+
|
|
301
|
+
This function is syntactic sugar for `pl.col(column).std(ddof)`.
|
|
302
|
+
|
|
303
|
+
Parameters
|
|
304
|
+
----------
|
|
305
|
+
column
|
|
306
|
+
Column name.
|
|
307
|
+
ddof
|
|
308
|
+
“Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
|
|
309
|
+
where N represents the number of elements.
|
|
310
|
+
By default ddof is 1.
|
|
311
|
+
|
|
312
|
+
Examples
|
|
313
|
+
--------
|
|
314
|
+
>>> df = pl.DataFrame(
|
|
315
|
+
... {
|
|
316
|
+
... "a": [1, 8, 3],
|
|
317
|
+
... "b": [4, 5, 2],
|
|
318
|
+
... "c": ["foo", "bar", "foo"],
|
|
319
|
+
... }
|
|
320
|
+
... )
|
|
321
|
+
>>> df.select(pl.std("a"))
|
|
322
|
+
shape: (1, 1)
|
|
323
|
+
┌──────────┐
|
|
324
|
+
│ a │
|
|
325
|
+
│ --- │
|
|
326
|
+
│ f64 │
|
|
327
|
+
╞══════════╡
|
|
328
|
+
│ 3.605551 │
|
|
329
|
+
└──────────┘
|
|
330
|
+
>>> df["a"].std()
|
|
331
|
+
3.605551275463989
|
|
332
|
+
"""
|
|
333
|
+
return F.col(column).std(ddof)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def var(column: str, ddof: int = 1) -> Expr:
|
|
337
|
+
"""
|
|
338
|
+
Get the variance.
|
|
339
|
+
|
|
340
|
+
This function is syntactic sugar for `pl.col(column).var(ddof)`.
|
|
341
|
+
|
|
342
|
+
Parameters
|
|
343
|
+
----------
|
|
344
|
+
column
|
|
345
|
+
Column name.
|
|
346
|
+
ddof
|
|
347
|
+
“Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
|
|
348
|
+
where N represents the number of elements.
|
|
349
|
+
By default ddof is 1.
|
|
350
|
+
|
|
351
|
+
Examples
|
|
352
|
+
--------
|
|
353
|
+
>>> df = pl.DataFrame(
|
|
354
|
+
... {
|
|
355
|
+
... "a": [1, 8, 3],
|
|
356
|
+
... "b": [4, 5, 2],
|
|
357
|
+
... "c": ["foo", "bar", "foo"],
|
|
358
|
+
... },
|
|
359
|
+
... )
|
|
360
|
+
>>> df.select(pl.var("a"))
|
|
361
|
+
shape: (1, 1)
|
|
362
|
+
┌──────┐
|
|
363
|
+
│ a │
|
|
364
|
+
│ --- │
|
|
365
|
+
│ f64 │
|
|
366
|
+
╞══════╡
|
|
367
|
+
│ 13.0 │
|
|
368
|
+
└──────┘
|
|
369
|
+
>>> df["a"].var()
|
|
370
|
+
13.0
|
|
371
|
+
"""
|
|
372
|
+
return F.col(column).var(ddof)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def mean(*columns: str) -> Expr:
|
|
376
|
+
"""
|
|
377
|
+
Get the mean value.
|
|
378
|
+
|
|
379
|
+
This function is syntactic sugar for `pl.col(columns).mean()`.
|
|
380
|
+
|
|
381
|
+
Parameters
|
|
382
|
+
----------
|
|
383
|
+
*columns
|
|
384
|
+
One or more column names.
|
|
385
|
+
|
|
386
|
+
See Also
|
|
387
|
+
--------
|
|
388
|
+
mean_horizontal
|
|
389
|
+
|
|
390
|
+
Examples
|
|
391
|
+
--------
|
|
392
|
+
>>> df = pl.DataFrame(
|
|
393
|
+
... {
|
|
394
|
+
... "a": [1, 8, 3],
|
|
395
|
+
... "b": [4, 5, 2],
|
|
396
|
+
... "c": ["foo", "bar", "foo"],
|
|
397
|
+
... }
|
|
398
|
+
... )
|
|
399
|
+
>>> df.select(pl.mean("a"))
|
|
400
|
+
shape: (1, 1)
|
|
401
|
+
┌─────┐
|
|
402
|
+
│ a │
|
|
403
|
+
│ --- │
|
|
404
|
+
│ f64 │
|
|
405
|
+
╞═════╡
|
|
406
|
+
│ 4.0 │
|
|
407
|
+
└─────┘
|
|
408
|
+
>>> df.select(pl.mean("a", "b"))
|
|
409
|
+
shape: (1, 2)
|
|
410
|
+
┌─────┬──────────┐
|
|
411
|
+
│ a ┆ b │
|
|
412
|
+
│ --- ┆ --- │
|
|
413
|
+
│ f64 ┆ f64 │
|
|
414
|
+
╞═════╪══════════╡
|
|
415
|
+
│ 4.0 ┆ 3.666667 │
|
|
416
|
+
└─────┴──────────┘
|
|
417
|
+
|
|
418
|
+
"""
|
|
419
|
+
return F.col(*columns).mean()
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def median(*columns: str) -> Expr:
|
|
423
|
+
"""
|
|
424
|
+
Get the median value.
|
|
425
|
+
|
|
426
|
+
This function is syntactic sugar for `pl.col(columns).median()`.
|
|
427
|
+
|
|
428
|
+
Parameters
|
|
429
|
+
----------
|
|
430
|
+
columns
|
|
431
|
+
One or more column names.
|
|
432
|
+
|
|
433
|
+
Examples
|
|
434
|
+
--------
|
|
435
|
+
>>> df = pl.DataFrame(
|
|
436
|
+
... {
|
|
437
|
+
... "a": [1, 8, 3],
|
|
438
|
+
... "b": [4, 5, 2],
|
|
439
|
+
... "c": ["foo", "bar", "foo"],
|
|
440
|
+
... }
|
|
441
|
+
... )
|
|
442
|
+
>>> df.select(pl.median("a"))
|
|
443
|
+
shape: (1, 1)
|
|
444
|
+
┌─────┐
|
|
445
|
+
│ a │
|
|
446
|
+
│ --- │
|
|
447
|
+
│ f64 │
|
|
448
|
+
╞═════╡
|
|
449
|
+
│ 3.0 │
|
|
450
|
+
└─────┘
|
|
451
|
+
>>> df.select(pl.median("a", "b"))
|
|
452
|
+
shape: (1, 2)
|
|
453
|
+
┌─────┬─────┐
|
|
454
|
+
│ a ┆ b │
|
|
455
|
+
│ --- ┆ --- │
|
|
456
|
+
│ f64 ┆ f64 │
|
|
457
|
+
╞═════╪═════╡
|
|
458
|
+
│ 3.0 ┆ 4.0 │
|
|
459
|
+
└─────┴─────┘
|
|
460
|
+
|
|
461
|
+
"""
|
|
462
|
+
return F.col(*columns).median()
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def n_unique(*columns: str) -> Expr:
|
|
466
|
+
"""
|
|
467
|
+
Count unique values.
|
|
468
|
+
|
|
469
|
+
This function is syntactic sugar for `pl.col(columns).n_unique()`.
|
|
470
|
+
|
|
471
|
+
Parameters
|
|
472
|
+
----------
|
|
473
|
+
columns
|
|
474
|
+
One or more column names.
|
|
475
|
+
|
|
476
|
+
Examples
|
|
477
|
+
--------
|
|
478
|
+
>>> df = pl.DataFrame(
|
|
479
|
+
... {
|
|
480
|
+
... "a": [1, 8, 1],
|
|
481
|
+
... "b": [4, 5, 2],
|
|
482
|
+
... "c": ["foo", "bar", "foo"],
|
|
483
|
+
... }
|
|
484
|
+
... )
|
|
485
|
+
>>> df.select(pl.n_unique("a"))
|
|
486
|
+
shape: (1, 1)
|
|
487
|
+
┌─────┐
|
|
488
|
+
│ a │
|
|
489
|
+
│ --- │
|
|
490
|
+
│ u32 │
|
|
491
|
+
╞═════╡
|
|
492
|
+
│ 2 │
|
|
493
|
+
└─────┘
|
|
494
|
+
>>> df.select(pl.n_unique("b", "c"))
|
|
495
|
+
shape: (1, 2)
|
|
496
|
+
┌─────┬─────┐
|
|
497
|
+
│ b ┆ c │
|
|
498
|
+
│ --- ┆ --- │
|
|
499
|
+
│ u32 ┆ u32 │
|
|
500
|
+
╞═════╪═════╡
|
|
501
|
+
│ 3 ┆ 2 │
|
|
502
|
+
└─────┴─────┘
|
|
503
|
+
|
|
504
|
+
"""
|
|
505
|
+
return F.col(*columns).n_unique()
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def approx_n_unique(*columns: str) -> Expr:
|
|
509
|
+
"""
|
|
510
|
+
Approximate count of unique values.
|
|
511
|
+
|
|
512
|
+
This function is syntactic sugar for `pl.col(columns).approx_n_unique()`, and
|
|
513
|
+
uses the HyperLogLog++ algorithm for cardinality estimation.
|
|
514
|
+
|
|
515
|
+
Parameters
|
|
516
|
+
----------
|
|
517
|
+
columns
|
|
518
|
+
One or more column names.
|
|
519
|
+
|
|
520
|
+
Examples
|
|
521
|
+
--------
|
|
522
|
+
>>> df = pl.DataFrame(
|
|
523
|
+
... {
|
|
524
|
+
... "a": [1, 8, 1],
|
|
525
|
+
... "b": [4, 5, 2],
|
|
526
|
+
... "c": ["foo", "bar", "foo"],
|
|
527
|
+
... }
|
|
528
|
+
... )
|
|
529
|
+
>>> df.select(pl.approx_n_unique("a"))
|
|
530
|
+
shape: (1, 1)
|
|
531
|
+
┌─────┐
|
|
532
|
+
│ a │
|
|
533
|
+
│ --- │
|
|
534
|
+
│ u32 │
|
|
535
|
+
╞═════╡
|
|
536
|
+
│ 2 │
|
|
537
|
+
└─────┘
|
|
538
|
+
>>> df.select(pl.approx_n_unique("b", "c"))
|
|
539
|
+
shape: (1, 2)
|
|
540
|
+
┌─────┬─────┐
|
|
541
|
+
│ b ┆ c │
|
|
542
|
+
│ --- ┆ --- │
|
|
543
|
+
│ u32 ┆ u32 │
|
|
544
|
+
╞═════╪═════╡
|
|
545
|
+
│ 3 ┆ 2 │
|
|
546
|
+
└─────┴─────┘
|
|
547
|
+
|
|
548
|
+
"""
|
|
549
|
+
return F.col(*columns).approx_n_unique()
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def first(*columns: str) -> Expr:
|
|
553
|
+
"""
|
|
554
|
+
Get the first column or value.
|
|
555
|
+
|
|
556
|
+
This function has different behavior depending on the presence of `columns`
|
|
557
|
+
values. If none given (the default), returns an expression that takes the first
|
|
558
|
+
column of the context; otherwise, takes the first value of the given column(s).
|
|
559
|
+
|
|
560
|
+
Parameters
|
|
561
|
+
----------
|
|
562
|
+
*columns
|
|
563
|
+
One or more column names.
|
|
564
|
+
|
|
565
|
+
Examples
|
|
566
|
+
--------
|
|
567
|
+
>>> df = pl.DataFrame(
|
|
568
|
+
... {
|
|
569
|
+
... "a": [1, 8, 3],
|
|
570
|
+
... "b": [4, 5, 2],
|
|
571
|
+
... "c": ["foo", "bar", "baz"],
|
|
572
|
+
... }
|
|
573
|
+
... )
|
|
574
|
+
|
|
575
|
+
Return the first column:
|
|
576
|
+
|
|
577
|
+
>>> df.select(pl.first())
|
|
578
|
+
shape: (3, 1)
|
|
579
|
+
┌─────┐
|
|
580
|
+
│ a │
|
|
581
|
+
│ --- │
|
|
582
|
+
│ i64 │
|
|
583
|
+
╞═════╡
|
|
584
|
+
│ 1 │
|
|
585
|
+
│ 8 │
|
|
586
|
+
│ 3 │
|
|
587
|
+
└─────┘
|
|
588
|
+
|
|
589
|
+
Return the first value for the given column(s):
|
|
590
|
+
|
|
591
|
+
>>> df.select(pl.first("b"))
|
|
592
|
+
shape: (1, 1)
|
|
593
|
+
┌─────┐
|
|
594
|
+
│ b │
|
|
595
|
+
│ --- │
|
|
596
|
+
│ i64 │
|
|
597
|
+
╞═════╡
|
|
598
|
+
│ 4 │
|
|
599
|
+
└─────┘
|
|
600
|
+
>>> df.select(pl.first("a", "c"))
|
|
601
|
+
shape: (1, 2)
|
|
602
|
+
┌─────┬─────┐
|
|
603
|
+
│ a ┆ c │
|
|
604
|
+
│ --- ┆ --- │
|
|
605
|
+
│ i64 ┆ str │
|
|
606
|
+
╞═════╪═════╡
|
|
607
|
+
│ 1 ┆ foo │
|
|
608
|
+
└─────┴─────┘
|
|
609
|
+
|
|
610
|
+
"""
|
|
611
|
+
if not columns:
|
|
612
|
+
return cs.first().as_expr()
|
|
613
|
+
|
|
614
|
+
return F.col(*columns).first()
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def last(*columns: str) -> Expr:
|
|
618
|
+
"""
|
|
619
|
+
Get the last column or value.
|
|
620
|
+
|
|
621
|
+
This function has different behavior depending on the presence of `columns`
|
|
622
|
+
values. If none given (the default), returns an expression that takes the last
|
|
623
|
+
column of the context; otherwise, takes the last value of the given column(s).
|
|
624
|
+
|
|
625
|
+
Parameters
|
|
626
|
+
----------
|
|
627
|
+
*columns
|
|
628
|
+
One or more column names.
|
|
629
|
+
|
|
630
|
+
Examples
|
|
631
|
+
--------
|
|
632
|
+
>>> df = pl.DataFrame(
|
|
633
|
+
... {
|
|
634
|
+
... "a": [1, 8, 3],
|
|
635
|
+
... "b": [4, 5, 2],
|
|
636
|
+
... "c": ["foo", "bar", "baz"],
|
|
637
|
+
... }
|
|
638
|
+
... )
|
|
639
|
+
|
|
640
|
+
Return the last column:
|
|
641
|
+
|
|
642
|
+
>>> df.select(pl.last())
|
|
643
|
+
shape: (3, 1)
|
|
644
|
+
┌─────┐
|
|
645
|
+
│ c │
|
|
646
|
+
│ --- │
|
|
647
|
+
│ str │
|
|
648
|
+
╞═════╡
|
|
649
|
+
│ foo │
|
|
650
|
+
│ bar │
|
|
651
|
+
│ baz │
|
|
652
|
+
└─────┘
|
|
653
|
+
|
|
654
|
+
Return the last value for the given column(s):
|
|
655
|
+
|
|
656
|
+
>>> df.select(pl.last("a"))
|
|
657
|
+
shape: (1, 1)
|
|
658
|
+
┌─────┐
|
|
659
|
+
│ a │
|
|
660
|
+
│ --- │
|
|
661
|
+
│ i64 │
|
|
662
|
+
╞═════╡
|
|
663
|
+
│ 3 │
|
|
664
|
+
└─────┘
|
|
665
|
+
>>> df.select(pl.last("b", "c"))
|
|
666
|
+
shape: (1, 2)
|
|
667
|
+
┌─────┬─────┐
|
|
668
|
+
│ b ┆ c │
|
|
669
|
+
│ --- ┆ --- │
|
|
670
|
+
│ i64 ┆ str │
|
|
671
|
+
╞═════╪═════╡
|
|
672
|
+
│ 2 ┆ baz │
|
|
673
|
+
└─────┴─────┘
|
|
674
|
+
|
|
675
|
+
"""
|
|
676
|
+
if not columns:
|
|
677
|
+
return cs.last().as_expr()
|
|
678
|
+
|
|
679
|
+
return F.col(*columns).last()
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
def nth(*indices: int | Sequence[int], strict: bool = True) -> Expr:
|
|
683
|
+
"""
|
|
684
|
+
Get the nth column(s) of the context.
|
|
685
|
+
|
|
686
|
+
Parameters
|
|
687
|
+
----------
|
|
688
|
+
indices
|
|
689
|
+
One or more indices representing the columns to retrieve.
|
|
690
|
+
|
|
691
|
+
Examples
|
|
692
|
+
--------
|
|
693
|
+
>>> df = pl.DataFrame(
|
|
694
|
+
... {
|
|
695
|
+
... "a": [1, 8, 3],
|
|
696
|
+
... "b": [4, 5, 2],
|
|
697
|
+
... "c": ["foo", "bar", "baz"],
|
|
698
|
+
... }
|
|
699
|
+
... )
|
|
700
|
+
>>> df.select(pl.nth(1))
|
|
701
|
+
shape: (3, 1)
|
|
702
|
+
┌─────┐
|
|
703
|
+
│ b │
|
|
704
|
+
│ --- │
|
|
705
|
+
│ i64 │
|
|
706
|
+
╞═════╡
|
|
707
|
+
│ 4 │
|
|
708
|
+
│ 5 │
|
|
709
|
+
│ 2 │
|
|
710
|
+
└─────┘
|
|
711
|
+
>>> df.select(pl.nth(2, 0))
|
|
712
|
+
shape: (3, 2)
|
|
713
|
+
┌─────┬─────┐
|
|
714
|
+
│ c ┆ a │
|
|
715
|
+
│ --- ┆ --- │
|
|
716
|
+
│ str ┆ i64 │
|
|
717
|
+
╞═════╪═════╡
|
|
718
|
+
│ foo ┆ 1 │
|
|
719
|
+
│ bar ┆ 8 │
|
|
720
|
+
│ baz ┆ 3 │
|
|
721
|
+
└─────┴─────┘
|
|
722
|
+
"""
|
|
723
|
+
return cs.by_index(*indices, require_all=strict).as_expr()
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def head(column: str, n: int = 10) -> Expr:
|
|
727
|
+
"""
|
|
728
|
+
Get the first `n` rows.
|
|
729
|
+
|
|
730
|
+
This function is syntactic sugar for `pl.col(column).head(n)`.
|
|
731
|
+
|
|
732
|
+
Parameters
|
|
733
|
+
----------
|
|
734
|
+
column
|
|
735
|
+
Column name.
|
|
736
|
+
n
|
|
737
|
+
Number of rows to return.
|
|
738
|
+
|
|
739
|
+
Examples
|
|
740
|
+
--------
|
|
741
|
+
>>> df = pl.DataFrame(
|
|
742
|
+
... {
|
|
743
|
+
... "a": [1, 8, 3],
|
|
744
|
+
... "b": [4, 5, 2],
|
|
745
|
+
... "c": ["foo", "bar", "foo"],
|
|
746
|
+
... }
|
|
747
|
+
... )
|
|
748
|
+
>>> df.select(pl.head("a"))
|
|
749
|
+
shape: (3, 1)
|
|
750
|
+
┌─────┐
|
|
751
|
+
│ a │
|
|
752
|
+
│ --- │
|
|
753
|
+
│ i64 │
|
|
754
|
+
╞═════╡
|
|
755
|
+
│ 1 │
|
|
756
|
+
│ 8 │
|
|
757
|
+
│ 3 │
|
|
758
|
+
└─────┘
|
|
759
|
+
>>> df.select(pl.head("a", 2))
|
|
760
|
+
shape: (2, 1)
|
|
761
|
+
┌─────┐
|
|
762
|
+
│ a │
|
|
763
|
+
│ --- │
|
|
764
|
+
│ i64 │
|
|
765
|
+
╞═════╡
|
|
766
|
+
│ 1 │
|
|
767
|
+
│ 8 │
|
|
768
|
+
└─────┘
|
|
769
|
+
"""
|
|
770
|
+
return F.col(column).head(n)
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
def tail(column: str, n: int = 10) -> Expr:
|
|
774
|
+
"""
|
|
775
|
+
Get the last `n` rows.
|
|
776
|
+
|
|
777
|
+
This function is syntactic sugar for `pl.col(column).tail(n)`.
|
|
778
|
+
|
|
779
|
+
Parameters
|
|
780
|
+
----------
|
|
781
|
+
column
|
|
782
|
+
Column name.
|
|
783
|
+
n
|
|
784
|
+
Number of rows to return.
|
|
785
|
+
|
|
786
|
+
Examples
|
|
787
|
+
--------
|
|
788
|
+
>>> df = pl.DataFrame(
|
|
789
|
+
... {
|
|
790
|
+
... "a": [1, 8, 3],
|
|
791
|
+
... "b": [4, 5, 2],
|
|
792
|
+
... "c": ["foo", "bar", "foo"],
|
|
793
|
+
... }
|
|
794
|
+
... )
|
|
795
|
+
>>> df.select(pl.tail("a"))
|
|
796
|
+
shape: (3, 1)
|
|
797
|
+
┌─────┐
|
|
798
|
+
│ a │
|
|
799
|
+
│ --- │
|
|
800
|
+
│ i64 │
|
|
801
|
+
╞═════╡
|
|
802
|
+
│ 1 │
|
|
803
|
+
│ 8 │
|
|
804
|
+
│ 3 │
|
|
805
|
+
└─────┘
|
|
806
|
+
>>> df.select(pl.tail("a", 2))
|
|
807
|
+
shape: (2, 1)
|
|
808
|
+
┌─────┐
|
|
809
|
+
│ a │
|
|
810
|
+
│ --- │
|
|
811
|
+
│ i64 │
|
|
812
|
+
╞═════╡
|
|
813
|
+
│ 8 │
|
|
814
|
+
│ 3 │
|
|
815
|
+
└─────┘
|
|
816
|
+
"""
|
|
817
|
+
return F.col(column).tail(n)
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
@overload
|
|
821
|
+
def corr(
|
|
822
|
+
a: IntoExpr,
|
|
823
|
+
b: IntoExpr,
|
|
824
|
+
*,
|
|
825
|
+
method: CorrelationMethod = ...,
|
|
826
|
+
ddof: int | None = ...,
|
|
827
|
+
propagate_nans: bool = ...,
|
|
828
|
+
eager: Literal[False] = ...,
|
|
829
|
+
) -> Expr: ...
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
@overload
|
|
833
|
+
def corr(
|
|
834
|
+
a: IntoExpr,
|
|
835
|
+
b: IntoExpr,
|
|
836
|
+
*,
|
|
837
|
+
method: CorrelationMethod = ...,
|
|
838
|
+
ddof: int | None = ...,
|
|
839
|
+
propagate_nans: bool = ...,
|
|
840
|
+
eager: Literal[True],
|
|
841
|
+
) -> Series: ...
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def corr(
|
|
845
|
+
a: IntoExpr,
|
|
846
|
+
b: IntoExpr,
|
|
847
|
+
*,
|
|
848
|
+
method: CorrelationMethod = "pearson",
|
|
849
|
+
ddof: int | None = None,
|
|
850
|
+
propagate_nans: bool = False,
|
|
851
|
+
eager: bool = False,
|
|
852
|
+
) -> Expr | Series:
|
|
853
|
+
"""
|
|
854
|
+
Compute the Pearson's or Spearman rank correlation between two columns.
|
|
855
|
+
|
|
856
|
+
Parameters
|
|
857
|
+
----------
|
|
858
|
+
a
|
|
859
|
+
Column name or Expression.
|
|
860
|
+
b
|
|
861
|
+
Column name or Expression.
|
|
862
|
+
ddof
|
|
863
|
+
Has no effect, do not use.
|
|
864
|
+
|
|
865
|
+
.. deprecated:: 1.17.0
|
|
866
|
+
|
|
867
|
+
method : {'pearson', 'spearman'}
|
|
868
|
+
Correlation method.
|
|
869
|
+
propagate_nans
|
|
870
|
+
If `True` any `NaN` encountered will lead to `NaN` in the output.
|
|
871
|
+
Defaults to `False` where `NaN` are regarded as larger than any finite number
|
|
872
|
+
and thus lead to the highest rank.
|
|
873
|
+
eager
|
|
874
|
+
Evaluate immediately and return a `Series`; this requires that at least one
|
|
875
|
+
of the given arguments is a `Series`. If set to `False` (default), return
|
|
876
|
+
an expression instead.
|
|
877
|
+
|
|
878
|
+
Examples
|
|
879
|
+
--------
|
|
880
|
+
Pearson's correlation:
|
|
881
|
+
|
|
882
|
+
>>> df = pl.DataFrame(
|
|
883
|
+
... {
|
|
884
|
+
... "a": [1, 8, 3],
|
|
885
|
+
... "b": [4, 5, 2],
|
|
886
|
+
... "c": ["foo", "bar", "foo"],
|
|
887
|
+
... }
|
|
888
|
+
... )
|
|
889
|
+
>>> df.select(pl.corr("a", "b"))
|
|
890
|
+
shape: (1, 1)
|
|
891
|
+
┌──────────┐
|
|
892
|
+
│ a │
|
|
893
|
+
│ --- │
|
|
894
|
+
│ f64 │
|
|
895
|
+
╞══════════╡
|
|
896
|
+
│ 0.544705 │
|
|
897
|
+
└──────────┘
|
|
898
|
+
|
|
899
|
+
Spearman rank correlation:
|
|
900
|
+
|
|
901
|
+
>>> df.select(pl.corr("a", "b", method="spearman"))
|
|
902
|
+
shape: (1, 1)
|
|
903
|
+
┌─────┐
|
|
904
|
+
│ a │
|
|
905
|
+
│ --- │
|
|
906
|
+
│ f64 │
|
|
907
|
+
╞═════╡
|
|
908
|
+
│ 0.5 │
|
|
909
|
+
└─────┘
|
|
910
|
+
|
|
911
|
+
Eager evaluation:
|
|
912
|
+
|
|
913
|
+
>>> s1 = pl.Series("a", [1, 8, 3])
|
|
914
|
+
>>> s2 = pl.Series("b", [4, 5, 2])
|
|
915
|
+
>>> pl.corr(s1, s2, eager=True)
|
|
916
|
+
shape: (1,)
|
|
917
|
+
Series: 'a' [f64]
|
|
918
|
+
[
|
|
919
|
+
0.544705
|
|
920
|
+
]
|
|
921
|
+
>>> pl.corr(s1, s2, method="spearman", eager=True)
|
|
922
|
+
shape: (1,)
|
|
923
|
+
Series: 'a' [f64]
|
|
924
|
+
[
|
|
925
|
+
0.5
|
|
926
|
+
]
|
|
927
|
+
"""
|
|
928
|
+
if ddof is not None:
|
|
929
|
+
issue_deprecation_warning(
|
|
930
|
+
"the `ddof` parameter has no effect. Do not use it.",
|
|
931
|
+
version="1.17.0",
|
|
932
|
+
)
|
|
933
|
+
|
|
934
|
+
if eager:
|
|
935
|
+
if not (isinstance(a, pl.Series) or isinstance(b, pl.Series)):
|
|
936
|
+
msg = "expected at least one Series in 'corr' inputs if 'eager=True'"
|
|
937
|
+
raise ValueError(msg)
|
|
938
|
+
|
|
939
|
+
frame = pl.DataFrame([e for e in (a, b) if isinstance(e, pl.Series)])
|
|
940
|
+
exprs = ((e.name if isinstance(e, pl.Series) else e) for e in (a, b))
|
|
941
|
+
return frame.select(
|
|
942
|
+
corr(*exprs, eager=False, method=method, propagate_nans=propagate_nans)
|
|
943
|
+
).to_series()
|
|
944
|
+
else:
|
|
945
|
+
a_pyexpr = parse_into_expression(a)
|
|
946
|
+
b_pyexpr = parse_into_expression(b)
|
|
947
|
+
|
|
948
|
+
if method == "pearson":
|
|
949
|
+
return wrap_expr(plr.pearson_corr(a_pyexpr, b_pyexpr))
|
|
950
|
+
elif method == "spearman":
|
|
951
|
+
return wrap_expr(plr.spearman_rank_corr(a_pyexpr, b_pyexpr, propagate_nans))
|
|
952
|
+
else:
|
|
953
|
+
msg = f"method must be one of {{'pearson', 'spearman'}}, got {method!r}"
|
|
954
|
+
raise ValueError(msg)
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
@overload
|
|
958
|
+
def cov(
|
|
959
|
+
a: IntoExpr,
|
|
960
|
+
b: IntoExpr,
|
|
961
|
+
*,
|
|
962
|
+
ddof: int = ...,
|
|
963
|
+
eager: Literal[False] = ...,
|
|
964
|
+
) -> Expr: ...
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
@overload
|
|
968
|
+
def cov(
|
|
969
|
+
a: IntoExpr,
|
|
970
|
+
b: IntoExpr,
|
|
971
|
+
*,
|
|
972
|
+
ddof: int = ...,
|
|
973
|
+
eager: Literal[True],
|
|
974
|
+
) -> Series: ...
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
def cov(
|
|
978
|
+
a: IntoExpr,
|
|
979
|
+
b: IntoExpr,
|
|
980
|
+
*,
|
|
981
|
+
ddof: int = 1,
|
|
982
|
+
eager: bool = False,
|
|
983
|
+
) -> Expr | Series:
|
|
984
|
+
"""
|
|
985
|
+
Compute the covariance between two columns/ expressions.
|
|
986
|
+
|
|
987
|
+
Parameters
|
|
988
|
+
----------
|
|
989
|
+
a
|
|
990
|
+
Column name or Expression.
|
|
991
|
+
b
|
|
992
|
+
Column name or Expression.
|
|
993
|
+
ddof
|
|
994
|
+
"Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
|
|
995
|
+
where N represents the number of elements.
|
|
996
|
+
By default ddof is 1.
|
|
997
|
+
eager
|
|
998
|
+
Evaluate immediately and return a `Series`; this requires that at least one
|
|
999
|
+
of the given arguments is a `Series`. If set to `False` (default), return
|
|
1000
|
+
an expression instead.
|
|
1001
|
+
|
|
1002
|
+
Examples
|
|
1003
|
+
--------
|
|
1004
|
+
>>> df = pl.DataFrame(
|
|
1005
|
+
... {
|
|
1006
|
+
... "a": [1, 8, 3],
|
|
1007
|
+
... "b": [4, 5, 2],
|
|
1008
|
+
... "c": ["foo", "bar", "foo"],
|
|
1009
|
+
... },
|
|
1010
|
+
... )
|
|
1011
|
+
|
|
1012
|
+
>>> df.select(
|
|
1013
|
+
... x=pl.cov("a", "b"),
|
|
1014
|
+
... y=pl.cov("a", "b", ddof=2),
|
|
1015
|
+
... )
|
|
1016
|
+
shape: (1, 2)
|
|
1017
|
+
┌─────┬─────┐
|
|
1018
|
+
│ x ┆ y │
|
|
1019
|
+
│ --- ┆ --- │
|
|
1020
|
+
│ f64 ┆ f64 │
|
|
1021
|
+
╞═════╪═════╡
|
|
1022
|
+
│ 3.0 ┆ 6.0 │
|
|
1023
|
+
└─────┴─────┘
|
|
1024
|
+
|
|
1025
|
+
Eager evaluation:
|
|
1026
|
+
|
|
1027
|
+
>>> s1 = pl.Series("a", [1, 8, 3])
|
|
1028
|
+
>>> s2 = pl.Series("b", [4, 5, 2])
|
|
1029
|
+
>>> pl.cov(s1, s2, eager=True)
|
|
1030
|
+
shape: (1,)
|
|
1031
|
+
Series: 'a' [f64]
|
|
1032
|
+
[
|
|
1033
|
+
3.0
|
|
1034
|
+
]
|
|
1035
|
+
"""
|
|
1036
|
+
if eager:
|
|
1037
|
+
if not (isinstance(a, pl.Series) or isinstance(b, pl.Series)):
|
|
1038
|
+
msg = "expected at least one Series in 'cov' inputs if 'eager=True'"
|
|
1039
|
+
raise ValueError(msg)
|
|
1040
|
+
|
|
1041
|
+
frame = pl.DataFrame([e for e in (a, b) if isinstance(e, pl.Series)])
|
|
1042
|
+
exprs = ((e.name if isinstance(e, pl.Series) else e) for e in (a, b))
|
|
1043
|
+
return frame.select(cov(*exprs, eager=False, ddof=ddof)).to_series()
|
|
1044
|
+
else:
|
|
1045
|
+
a_pyexpr = parse_into_expression(a)
|
|
1046
|
+
b_pyexpr = parse_into_expression(b)
|
|
1047
|
+
return wrap_expr(plr.cov(a_pyexpr, b_pyexpr, ddof))
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
class _map_batches_wrapper:
|
|
1051
|
+
def __init__(
|
|
1052
|
+
self,
|
|
1053
|
+
function: Callable[[Sequence[Series]], Series | Any],
|
|
1054
|
+
*,
|
|
1055
|
+
returns_scalar: bool,
|
|
1056
|
+
) -> None:
|
|
1057
|
+
self.function = function
|
|
1058
|
+
self.returns_scalar = returns_scalar
|
|
1059
|
+
|
|
1060
|
+
def __call__(
|
|
1061
|
+
self, sl: list[plr.PySeries], *args: Any, **kwargs: Any
|
|
1062
|
+
) -> plr.PySeries:
|
|
1063
|
+
return_dtype = kwargs["return_dtype"]
|
|
1064
|
+
slp = [wrap_s(s) for s in sl]
|
|
1065
|
+
|
|
1066
|
+
# ufunc and numba don't expect return_dtype
|
|
1067
|
+
try:
|
|
1068
|
+
rv = self.function(slp, *args, **kwargs)
|
|
1069
|
+
except TypeError as e:
|
|
1070
|
+
if "unexpected keyword argument 'return_dtype'" in e.args[0]:
|
|
1071
|
+
kwargs.pop("return_dtype")
|
|
1072
|
+
rv = self.function(slp, *args, **kwargs)
|
|
1073
|
+
else:
|
|
1074
|
+
raise
|
|
1075
|
+
|
|
1076
|
+
if _check_for_numpy(rv) and isinstance(rv, np.ndarray):
|
|
1077
|
+
rv = pl.Series(rv, dtype=return_dtype)
|
|
1078
|
+
|
|
1079
|
+
if isinstance(rv, pl.Series):
|
|
1080
|
+
return rv._s
|
|
1081
|
+
elif self.returns_scalar:
|
|
1082
|
+
return pl.Series([rv], dtype=return_dtype)._s
|
|
1083
|
+
else:
|
|
1084
|
+
msg = f"`map` with `returns_scalar=False` must return a Series; found {qualified_type_name(rv)!r}.\n\nIf `returns_scalar` is set to `True`, a returned value can be a scalar value."
|
|
1085
|
+
raise TypeError(msg)
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
def map_batches(
|
|
1089
|
+
exprs: Sequence[str | Expr],
|
|
1090
|
+
function: Callable[[Sequence[Series]], Series | Any],
|
|
1091
|
+
return_dtype: PolarsDataType | pl.DataTypeExpr | None = None,
|
|
1092
|
+
*,
|
|
1093
|
+
is_elementwise: bool = False,
|
|
1094
|
+
returns_scalar: bool = False,
|
|
1095
|
+
) -> Expr:
|
|
1096
|
+
"""
|
|
1097
|
+
Map a custom function over multiple columns/expressions.
|
|
1098
|
+
|
|
1099
|
+
Produces a single Series result.
|
|
1100
|
+
|
|
1101
|
+
.. warning::
|
|
1102
|
+
This method is much slower than the native expressions API.
|
|
1103
|
+
Only use it if you cannot implement your logic otherwise.
|
|
1104
|
+
|
|
1105
|
+
Parameters
|
|
1106
|
+
----------
|
|
1107
|
+
exprs
|
|
1108
|
+
Expression(s) representing the input Series to the function.
|
|
1109
|
+
function
|
|
1110
|
+
Function to apply over the input.
|
|
1111
|
+
return_dtype
|
|
1112
|
+
Datatype of the output Series.
|
|
1113
|
+
|
|
1114
|
+
It is recommended to set this whenever possible. If this is `None`, it tries
|
|
1115
|
+
to infer the datatype by calling the function with dummy data and looking at
|
|
1116
|
+
the output.
|
|
1117
|
+
is_elementwise
|
|
1118
|
+
Set to true if the operations is elementwise for better performance
|
|
1119
|
+
and optimization.
|
|
1120
|
+
|
|
1121
|
+
An elementwise operations has unit or equal length for all inputs
|
|
1122
|
+
and can be ran sequentially on slices without results being affected.
|
|
1123
|
+
returns_scalar
|
|
1124
|
+
If the function returns a scalar, by default it will be wrapped in
|
|
1125
|
+
a list in the output, since the assumption is that the function
|
|
1126
|
+
always returns something Series-like. If you want to keep the
|
|
1127
|
+
result as a scalar, set this argument to True.
|
|
1128
|
+
|
|
1129
|
+
Notes
|
|
1130
|
+
-----
|
|
1131
|
+
A UDF passed to `map_batches` must be pure, meaning that it cannot modify
|
|
1132
|
+
or depend on state other than its arguments. We may call the function
|
|
1133
|
+
with arbitrary input data.
|
|
1134
|
+
|
|
1135
|
+
Returns
|
|
1136
|
+
-------
|
|
1137
|
+
Expr
|
|
1138
|
+
Expression with the data type given by `return_dtype`.
|
|
1139
|
+
|
|
1140
|
+
Examples
|
|
1141
|
+
--------
|
|
1142
|
+
>>> def test_func(a, b, c):
|
|
1143
|
+
... return a + b + c
|
|
1144
|
+
>>> df = pl.DataFrame(
|
|
1145
|
+
... {
|
|
1146
|
+
... "a": [1, 2, 3, 4],
|
|
1147
|
+
... "b": [4, 5, 6, 7],
|
|
1148
|
+
... }
|
|
1149
|
+
... )
|
|
1150
|
+
>>>
|
|
1151
|
+
>>> df.with_columns(
|
|
1152
|
+
... (
|
|
1153
|
+
... pl.struct(["a", "b"]).map_batches(
|
|
1154
|
+
... lambda x: test_func(x.struct.field("a"), x.struct.field("b"), 1)
|
|
1155
|
+
... )
|
|
1156
|
+
... ).alias("a+b+c")
|
|
1157
|
+
... )
|
|
1158
|
+
shape: (4, 3)
|
|
1159
|
+
┌─────┬─────┬───────┐
|
|
1160
|
+
│ a ┆ b ┆ a+b+c │
|
|
1161
|
+
│ --- ┆ --- ┆ --- │
|
|
1162
|
+
│ i64 ┆ i64 ┆ i64 │
|
|
1163
|
+
╞═════╪═════╪═══════╡
|
|
1164
|
+
│ 1 ┆ 4 ┆ 6 │
|
|
1165
|
+
│ 2 ┆ 5 ┆ 8 │
|
|
1166
|
+
│ 3 ┆ 6 ┆ 10 │
|
|
1167
|
+
│ 4 ┆ 7 ┆ 12 │
|
|
1168
|
+
└─────┴─────┴───────┘
|
|
1169
|
+
"""
|
|
1170
|
+
pyexprs = parse_into_list_of_expressions(exprs)
|
|
1171
|
+
|
|
1172
|
+
return_dtype_expr = (
|
|
1173
|
+
parse_into_datatype_expr(return_dtype)._pydatatype_expr
|
|
1174
|
+
if return_dtype is not None
|
|
1175
|
+
else None
|
|
1176
|
+
)
|
|
1177
|
+
|
|
1178
|
+
return wrap_expr(
|
|
1179
|
+
plr.map_expr(
|
|
1180
|
+
pyexprs,
|
|
1181
|
+
_map_batches_wrapper(function, returns_scalar=returns_scalar),
|
|
1182
|
+
return_dtype_expr,
|
|
1183
|
+
is_elementwise=is_elementwise,
|
|
1184
|
+
returns_scalar=returns_scalar,
|
|
1185
|
+
)
|
|
1186
|
+
)
|
|
1187
|
+
|
|
1188
|
+
|
|
1189
|
+
def map_groups(
|
|
1190
|
+
exprs: Sequence[str | Expr],
|
|
1191
|
+
function: Callable[[Sequence[Series]], Series | Any],
|
|
1192
|
+
return_dtype: PolarsDataType | pl.DataTypeExpr | None = None,
|
|
1193
|
+
*,
|
|
1194
|
+
is_elementwise: bool = False,
|
|
1195
|
+
returns_scalar: bool = False,
|
|
1196
|
+
) -> Expr:
|
|
1197
|
+
"""
|
|
1198
|
+
Apply a custom/user-defined function (UDF) in a GroupBy context.
|
|
1199
|
+
|
|
1200
|
+
.. warning::
|
|
1201
|
+
This method is much slower than the native expressions API.
|
|
1202
|
+
Only use it if you cannot implement your logic otherwise.
|
|
1203
|
+
|
|
1204
|
+
Parameters
|
|
1205
|
+
----------
|
|
1206
|
+
exprs
|
|
1207
|
+
Expression(s) representing the input Series to the function.
|
|
1208
|
+
function
|
|
1209
|
+
Function to apply over the input; should be of type Callable[[Series], Series].
|
|
1210
|
+
return_dtype
|
|
1211
|
+
Datatype of the output Series.
|
|
1212
|
+
|
|
1213
|
+
It is recommended to set this whenever possible. If this is `None`, it tries
|
|
1214
|
+
to infer the datatype by calling the function with dummy data and looking at
|
|
1215
|
+
the output.
|
|
1216
|
+
is_elementwise
|
|
1217
|
+
Set to true if the operations is elementwise for better performance
|
|
1218
|
+
and optimization.
|
|
1219
|
+
|
|
1220
|
+
An elementwise operations has unit or equal length for all inputs
|
|
1221
|
+
and can be ran sequentially on slices without results being affected.
|
|
1222
|
+
returns_scalar
|
|
1223
|
+
If the function returns a single scalar as output.
|
|
1224
|
+
|
|
1225
|
+
Notes
|
|
1226
|
+
-----
|
|
1227
|
+
A UDF passed to `map_batches` must be pure, meaning that it cannot modify
|
|
1228
|
+
or depend on state other than its arguments. Polars may call the function
|
|
1229
|
+
with arbitrary input data.
|
|
1230
|
+
|
|
1231
|
+
Returns
|
|
1232
|
+
-------
|
|
1233
|
+
Expr
|
|
1234
|
+
Expression with the data type given by `return_dtype`.
|
|
1235
|
+
|
|
1236
|
+
Examples
|
|
1237
|
+
--------
|
|
1238
|
+
>>> df = pl.DataFrame(
|
|
1239
|
+
... {
|
|
1240
|
+
... "group": [1, 1, 2],
|
|
1241
|
+
... "a": [1, 3, 3],
|
|
1242
|
+
... "b": [5, 6, 7],
|
|
1243
|
+
... }
|
|
1244
|
+
... )
|
|
1245
|
+
>>> df
|
|
1246
|
+
shape: (3, 3)
|
|
1247
|
+
┌───────┬─────┬─────┐
|
|
1248
|
+
│ group ┆ a ┆ b │
|
|
1249
|
+
│ --- ┆ --- ┆ --- │
|
|
1250
|
+
│ i64 ┆ i64 ┆ i64 │
|
|
1251
|
+
╞═══════╪═════╪═════╡
|
|
1252
|
+
│ 1 ┆ 1 ┆ 5 │
|
|
1253
|
+
│ 1 ┆ 3 ┆ 6 │
|
|
1254
|
+
│ 2 ┆ 3 ┆ 7 │
|
|
1255
|
+
└───────┴─────┴─────┘
|
|
1256
|
+
>>> (
|
|
1257
|
+
... df.group_by("group").agg(
|
|
1258
|
+
... pl.map_groups(
|
|
1259
|
+
... exprs=["a", "b"],
|
|
1260
|
+
... function=lambda list_of_series: list_of_series[0]
|
|
1261
|
+
... / list_of_series[0].sum()
|
|
1262
|
+
... + list_of_series[1],
|
|
1263
|
+
... return_dtype=pl.Float64,
|
|
1264
|
+
... ).alias("my_custom_aggregation")
|
|
1265
|
+
... )
|
|
1266
|
+
... ).sort("group")
|
|
1267
|
+
shape: (2, 2)
|
|
1268
|
+
┌───────┬───────────────────────┐
|
|
1269
|
+
│ group ┆ my_custom_aggregation │
|
|
1270
|
+
│ --- ┆ --- │
|
|
1271
|
+
│ i64 ┆ list[f64] │
|
|
1272
|
+
╞═══════╪═══════════════════════╡
|
|
1273
|
+
│ 1 ┆ [5.25, 6.75] │
|
|
1274
|
+
│ 2 ┆ [8.0] │
|
|
1275
|
+
└───────┴───────────────────────┘
|
|
1276
|
+
|
|
1277
|
+
The output for group `1` can be understood as follows:
|
|
1278
|
+
|
|
1279
|
+
- group `1` contains Series `'a': [1, 3]` and `'b': [5, 6]`
|
|
1280
|
+
- applying the function to those lists of Series, one gets the output
|
|
1281
|
+
`[1 / 4 + 5, 3 / 4 + 6]`, i.e. `[5.25, 6.75]`
|
|
1282
|
+
"""
|
|
1283
|
+
return map_batches(
|
|
1284
|
+
exprs,
|
|
1285
|
+
function,
|
|
1286
|
+
return_dtype,
|
|
1287
|
+
is_elementwise=is_elementwise,
|
|
1288
|
+
returns_scalar=returns_scalar,
|
|
1289
|
+
)
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def _row_encode(
|
|
1293
|
+
exprs: pl.Selector | pl.Expr | Sequence[str | pl.Expr],
|
|
1294
|
+
*,
|
|
1295
|
+
unordered: bool = False,
|
|
1296
|
+
descending: list[bool] | None = None,
|
|
1297
|
+
nulls_last: list[bool] | None = None,
|
|
1298
|
+
) -> Expr:
|
|
1299
|
+
if isinstance(exprs, pl.Selector):
|
|
1300
|
+
exprs = [exprs.as_expr()]
|
|
1301
|
+
elif isinstance(exprs, pl.Expr):
|
|
1302
|
+
exprs = [exprs]
|
|
1303
|
+
|
|
1304
|
+
pyexprs = parse_into_list_of_expressions(exprs)
|
|
1305
|
+
|
|
1306
|
+
if unordered:
|
|
1307
|
+
assert descending is None
|
|
1308
|
+
assert nulls_last is None
|
|
1309
|
+
|
|
1310
|
+
result = plr.PyExpr.row_encode_unordered(pyexprs)
|
|
1311
|
+
else:
|
|
1312
|
+
result = plr.PyExpr.row_encode_ordered(pyexprs, descending, nulls_last)
|
|
1313
|
+
|
|
1314
|
+
return wrap_expr(result)
|
|
1315
|
+
|
|
1316
|
+
|
|
1317
|
+
def _wrap_acc_lamba(
|
|
1318
|
+
function: Callable[[Series, Series], Series],
|
|
1319
|
+
) -> Callable[[tuple[plr.PySeries, plr.PySeries]], plr.PySeries]:
|
|
1320
|
+
def wrapper(t: tuple[plr.PySeries, plr.PySeries]) -> plr.PySeries:
|
|
1321
|
+
a, b = t
|
|
1322
|
+
return function(wrap_s(a), wrap_s(b))._s
|
|
1323
|
+
|
|
1324
|
+
return wrapper
|
|
1325
|
+
|
|
1326
|
+
|
|
1327
|
+
def fold(
|
|
1328
|
+
acc: IntoExpr,
|
|
1329
|
+
function: Callable[[Series, Series], Series],
|
|
1330
|
+
exprs: Sequence[Expr | str] | Expr,
|
|
1331
|
+
*,
|
|
1332
|
+
returns_scalar: bool = False,
|
|
1333
|
+
return_dtype: pl.DataTypeExpr | PolarsDataType | None = None,
|
|
1334
|
+
) -> Expr:
|
|
1335
|
+
"""
|
|
1336
|
+
Accumulate over multiple columns horizontally/ row wise with a left fold.
|
|
1337
|
+
|
|
1338
|
+
Parameters
|
|
1339
|
+
----------
|
|
1340
|
+
acc
|
|
1341
|
+
Accumulator Expression. This is the value that will be initialized when the fold
|
|
1342
|
+
starts. For a sum this could for instance be lit(0).
|
|
1343
|
+
function
|
|
1344
|
+
Function to apply over the accumulator and the value.
|
|
1345
|
+
Fn(acc, value) -> new_value
|
|
1346
|
+
exprs
|
|
1347
|
+
Expressions to aggregate over. May also be a wildcard expression.
|
|
1348
|
+
returns_scalar
|
|
1349
|
+
Whether or not `function` applied returns a scalar. This must be set correctly
|
|
1350
|
+
by the user.
|
|
1351
|
+
return_dtype
|
|
1352
|
+
Output datatype.
|
|
1353
|
+
If not set, the dtype will be inferred based on the dtype
|
|
1354
|
+
of the accumulator.
|
|
1355
|
+
|
|
1356
|
+
Notes
|
|
1357
|
+
-----
|
|
1358
|
+
If you simply want the first encountered expression as accumulator,
|
|
1359
|
+
consider using `reduce`.
|
|
1360
|
+
|
|
1361
|
+
Examples
|
|
1362
|
+
--------
|
|
1363
|
+
>>> df = pl.DataFrame(
|
|
1364
|
+
... {
|
|
1365
|
+
... "a": [1, 2, 3],
|
|
1366
|
+
... "b": [3, 4, 5],
|
|
1367
|
+
... "c": [5, 6, 7],
|
|
1368
|
+
... }
|
|
1369
|
+
... )
|
|
1370
|
+
>>> df
|
|
1371
|
+
shape: (3, 3)
|
|
1372
|
+
┌─────┬─────┬─────┐
|
|
1373
|
+
│ a ┆ b ┆ c │
|
|
1374
|
+
│ --- ┆ --- ┆ --- │
|
|
1375
|
+
│ i64 ┆ i64 ┆ i64 │
|
|
1376
|
+
╞═════╪═════╪═════╡
|
|
1377
|
+
│ 1 ┆ 3 ┆ 5 │
|
|
1378
|
+
│ 2 ┆ 4 ┆ 6 │
|
|
1379
|
+
│ 3 ┆ 5 ┆ 7 │
|
|
1380
|
+
└─────┴─────┴─────┘
|
|
1381
|
+
|
|
1382
|
+
Horizontally sum over all columns and add 1.
|
|
1383
|
+
|
|
1384
|
+
>>> df.select(
|
|
1385
|
+
... pl.fold(
|
|
1386
|
+
... acc=pl.lit(1), function=lambda acc, x: acc + x, exprs=pl.col("*")
|
|
1387
|
+
... ).alias("sum"),
|
|
1388
|
+
... )
|
|
1389
|
+
shape: (3, 1)
|
|
1390
|
+
┌─────┐
|
|
1391
|
+
│ sum │
|
|
1392
|
+
│ --- │
|
|
1393
|
+
│ i32 │
|
|
1394
|
+
╞═════╡
|
|
1395
|
+
│ 10 │
|
|
1396
|
+
│ 13 │
|
|
1397
|
+
│ 16 │
|
|
1398
|
+
└─────┘
|
|
1399
|
+
|
|
1400
|
+
You can also apply a condition/predicate on all columns:
|
|
1401
|
+
|
|
1402
|
+
>>> df = pl.DataFrame(
|
|
1403
|
+
... {
|
|
1404
|
+
... "a": [1, 2, 3],
|
|
1405
|
+
... "b": [0, 1, 2],
|
|
1406
|
+
... }
|
|
1407
|
+
... )
|
|
1408
|
+
>>> df
|
|
1409
|
+
shape: (3, 2)
|
|
1410
|
+
┌─────┬─────┐
|
|
1411
|
+
│ a ┆ b │
|
|
1412
|
+
│ --- ┆ --- │
|
|
1413
|
+
│ i64 ┆ i64 │
|
|
1414
|
+
╞═════╪═════╡
|
|
1415
|
+
│ 1 ┆ 0 │
|
|
1416
|
+
│ 2 ┆ 1 │
|
|
1417
|
+
│ 3 ┆ 2 │
|
|
1418
|
+
└─────┴─────┘
|
|
1419
|
+
|
|
1420
|
+
>>> df.filter(
|
|
1421
|
+
... pl.fold(
|
|
1422
|
+
... acc=pl.lit(True),
|
|
1423
|
+
... function=lambda acc, x: acc & x,
|
|
1424
|
+
... exprs=pl.col("*") > 1,
|
|
1425
|
+
... )
|
|
1426
|
+
... )
|
|
1427
|
+
shape: (1, 2)
|
|
1428
|
+
┌─────┬─────┐
|
|
1429
|
+
│ a ┆ b │
|
|
1430
|
+
│ --- ┆ --- │
|
|
1431
|
+
│ i64 ┆ i64 │
|
|
1432
|
+
╞═════╪═════╡
|
|
1433
|
+
│ 3 ┆ 2 │
|
|
1434
|
+
└─────┴─────┘
|
|
1435
|
+
"""
|
|
1436
|
+
# in case of col("*")
|
|
1437
|
+
pyacc = parse_into_expression(acc, str_as_lit=True)
|
|
1438
|
+
if isinstance(exprs, pl.Expr):
|
|
1439
|
+
exprs = [exprs]
|
|
1440
|
+
|
|
1441
|
+
rt: plr.PyDataTypeExpr | None = None
|
|
1442
|
+
if return_dtype is not None:
|
|
1443
|
+
rt = parse_into_datatype_expr(return_dtype)._pydatatype_expr
|
|
1444
|
+
|
|
1445
|
+
pyexprs = parse_into_list_of_expressions(exprs)
|
|
1446
|
+
return wrap_expr(
|
|
1447
|
+
plr.fold(
|
|
1448
|
+
pyacc,
|
|
1449
|
+
_wrap_acc_lamba(function),
|
|
1450
|
+
pyexprs,
|
|
1451
|
+
returns_scalar=returns_scalar,
|
|
1452
|
+
return_dtype=rt,
|
|
1453
|
+
)
|
|
1454
|
+
)
|
|
1455
|
+
|
|
1456
|
+
|
|
1457
|
+
def reduce(
|
|
1458
|
+
function: Callable[[Series, Series], Series],
|
|
1459
|
+
exprs: Sequence[Expr | str] | Expr,
|
|
1460
|
+
*,
|
|
1461
|
+
returns_scalar: bool = False,
|
|
1462
|
+
return_dtype: pl.DataTypeExpr | PolarsDataType | None = None,
|
|
1463
|
+
) -> Expr:
|
|
1464
|
+
"""
|
|
1465
|
+
Accumulate over multiple columns horizontally/ row wise with a left fold.
|
|
1466
|
+
|
|
1467
|
+
Parameters
|
|
1468
|
+
----------
|
|
1469
|
+
function
|
|
1470
|
+
Function to apply over the accumulator and the value.
|
|
1471
|
+
Fn(acc, value) -> new_value
|
|
1472
|
+
exprs
|
|
1473
|
+
Expressions to aggregate over. May also be a wildcard expression.
|
|
1474
|
+
returns_scalar
|
|
1475
|
+
Whether or not `function` applied returns a scalar. This must be set correctly
|
|
1476
|
+
by the user.
|
|
1477
|
+
return_dtype
|
|
1478
|
+
Output datatype.
|
|
1479
|
+
If not set, the dtype will be inferred based on the dtype of the input
|
|
1480
|
+
expressions.
|
|
1481
|
+
|
|
1482
|
+
Notes
|
|
1483
|
+
-----
|
|
1484
|
+
See `fold` for the version with an explicit accumulator.
|
|
1485
|
+
|
|
1486
|
+
Examples
|
|
1487
|
+
--------
|
|
1488
|
+
>>> df = pl.DataFrame(
|
|
1489
|
+
... {
|
|
1490
|
+
... "a": [1, 2, 3],
|
|
1491
|
+
... "b": [0, 1, 2],
|
|
1492
|
+
... }
|
|
1493
|
+
... )
|
|
1494
|
+
>>> df
|
|
1495
|
+
shape: (3, 2)
|
|
1496
|
+
┌─────┬─────┐
|
|
1497
|
+
│ a ┆ b │
|
|
1498
|
+
│ --- ┆ --- │
|
|
1499
|
+
│ i64 ┆ i64 │
|
|
1500
|
+
╞═════╪═════╡
|
|
1501
|
+
│ 1 ┆ 0 │
|
|
1502
|
+
│ 2 ┆ 1 │
|
|
1503
|
+
│ 3 ┆ 2 │
|
|
1504
|
+
└─────┴─────┘
|
|
1505
|
+
|
|
1506
|
+
Horizontally sum over all columns.
|
|
1507
|
+
|
|
1508
|
+
>>> df.select(
|
|
1509
|
+
... pl.reduce(function=lambda acc, x: acc + x, exprs=pl.col("*")).alias("sum")
|
|
1510
|
+
... )
|
|
1511
|
+
shape: (3, 1)
|
|
1512
|
+
┌─────┐
|
|
1513
|
+
│ sum │
|
|
1514
|
+
│ --- │
|
|
1515
|
+
│ i64 │
|
|
1516
|
+
╞═════╡
|
|
1517
|
+
│ 1 │
|
|
1518
|
+
│ 3 │
|
|
1519
|
+
│ 5 │
|
|
1520
|
+
└─────┘
|
|
1521
|
+
"""
|
|
1522
|
+
if isinstance(exprs, pl.Expr):
|
|
1523
|
+
exprs = [exprs]
|
|
1524
|
+
|
|
1525
|
+
rt: plr.PyDataTypeExpr | None = None
|
|
1526
|
+
if return_dtype is not None:
|
|
1527
|
+
rt = parse_into_datatype_expr(return_dtype)._pydatatype_expr
|
|
1528
|
+
|
|
1529
|
+
pyexprs = parse_into_list_of_expressions(exprs)
|
|
1530
|
+
return wrap_expr(
|
|
1531
|
+
plr.reduce(
|
|
1532
|
+
_wrap_acc_lamba(function),
|
|
1533
|
+
pyexprs,
|
|
1534
|
+
returns_scalar=returns_scalar,
|
|
1535
|
+
return_dtype=rt,
|
|
1536
|
+
)
|
|
1537
|
+
)
|
|
1538
|
+
|
|
1539
|
+
|
|
1540
|
+
def cum_fold(
    acc: IntoExpr,
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
    *,
    returns_scalar: bool = False,
    return_dtype: pl.DataTypeExpr | PolarsDataType | None = None,
    include_init: bool = False,
) -> Expr:
    """
    Cumulatively fold horizontally across columns with a left fold.

    Every cumulative result is added as a separate field in a Struct column.

    Parameters
    ----------
    acc
        Accumulator expression. This is the value that will be initialized when the fold
        starts. For a sum this could for instance be `lit(0)`.
    function
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.
    returns_scalar
        Whether or not `function` applied returns a scalar. This must be set correctly
        by the user.
    return_dtype
        Output datatype.
        If not set, the dtype will be inferred based on the dtype of the accumulator.
    include_init
        Include the initial accumulator state as a struct field.

    Notes
    -----
    If you simply want the first encountered expression as accumulator,
    consider using :func:`cum_reduce`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [3, 4, 5],
    ...         "c": [5, 6, 7],
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.cum_fold(acc=pl.lit(1), function=lambda acc, x: acc + x, exprs=pl.all())
    ... )
    shape: (3, 4)
    ┌─────┬─────┬─────┬───────────┐
    │ a   ┆ b   ┆ c   ┆ cum_fold  │
    │ --- ┆ --- ┆ --- ┆ ---       │
    │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
    ╞═════╪═════╪═════╪═══════════╡
    │ 1   ┆ 3   ┆ 5   ┆ {2,5,10}  │
    │ 2   ┆ 4   ┆ 6   ┆ {3,7,13}  │
    │ 3   ┆ 5   ┆ 7   ┆ {4,9,16}  │
    └─────┴─────┴─────┴───────────┘
    """
    # in case of col("*")
    pyacc = parse_into_expression(acc, str_as_lit=True)
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]

    rt: plr.PyDataTypeExpr | None = None
    if return_dtype is not None:
        rt = parse_into_datatype_expr(return_dtype)._pydatatype_expr

    pyexprs = parse_into_list_of_expressions(exprs)
    return wrap_expr(
        plr.cum_fold(
            pyacc,
            _wrap_acc_lamba(function),
            pyexprs,
            returns_scalar=returns_scalar,
            return_dtype=rt,
            include_init=include_init,
        ).alias("cum_fold")
    )


def cum_reduce(
    function: Callable[[Series, Series], Series],
    exprs: Sequence[Expr | str] | Expr,
    *,
    returns_scalar: bool = False,
    return_dtype: pl.DataTypeExpr | PolarsDataType | None = None,
) -> Expr:
    """
    Cumulatively reduce horizontally across columns with a left fold.

    Every cumulative result is added as a separate field in a Struct column.

    Parameters
    ----------
    function
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.
    returns_scalar
        Whether or not `function` applied returns a scalar. This must be set correctly
        by the user.
    return_dtype
        Output datatype.
        If not set, the dtype will be inferred based on the dtype of the input
        expressions.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": [3, 4, 5],
    ...         "c": [5, 6, 7],
    ...     }
    ... )
    >>> df.with_columns(pl.cum_reduce(function=lambda acc, x: acc + x, exprs=pl.all()))
    shape: (3, 4)
    ┌─────┬─────┬─────┬────────────┐
    │ a   ┆ b   ┆ c   ┆ cum_reduce │
    │ --- ┆ --- ┆ --- ┆ ---        │
    │ i64 ┆ i64 ┆ i64 ┆ struct[3]  │
    ╞═════╪═════╪═════╪════════════╡
    │ 1   ┆ 3   ┆ 5   ┆ {1,4,9}    │
    │ 2   ┆ 4   ┆ 6   ┆ {2,6,12}   │
    │ 3   ┆ 5   ┆ 7   ┆ {3,8,15}   │
    └─────┴─────┴─────┴────────────┘
    """
    # in case of col("*")
    if isinstance(exprs, pl.Expr):
        exprs = [exprs]

    rt: plr.PyDataTypeExpr | None = None
    if return_dtype is not None:
        rt = parse_into_datatype_expr(return_dtype)._pydatatype_expr

    pyexprs = parse_into_list_of_expressions(exprs)
    return wrap_expr(
        plr.cum_reduce(
            _wrap_acc_lamba(function),
            pyexprs,
            returns_scalar=returns_scalar,
            return_dtype=rt,
        ).alias("cum_reduce")
    )


def arctan2(y: str | Expr, x: str | Expr) -> Expr:
    """
    Compute the two-argument arctangent in radians.

    Returns the angle (in radians) in the plane between the
    positive x-axis and the ray from the origin to (x,y).

    Parameters
    ----------
    y
        Column name or Expression.
    x
        Column name or Expression.

    Examples
    --------
    >>> c = (2**0.5) / 2
    >>> df = pl.DataFrame(
    ...     {
    ...         "y": [c, -c, c, -c],
    ...         "x": [c, c, -c, -c],
    ...     }
    ... )
    >>> df.with_columns(pl.arctan2("y", "x").alias("atan2"))
    shape: (4, 3)
    ┌───────────┬───────────┬───────────┐
    │ y         ┆ x         ┆ atan2     │
    │ ---       ┆ ---       ┆ ---       │
    │ f64       ┆ f64       ┆ f64       │
    ╞═══════════╪═══════════╪═══════════╡
    │ 0.707107  ┆ 0.707107  ┆ 0.785398  │
    │ -0.707107 ┆ 0.707107  ┆ -0.785398 │
    │ 0.707107  ┆ -0.707107 ┆ 2.356194  │
    │ -0.707107 ┆ -0.707107 ┆ -2.356194 │
    └───────────┴───────────┴───────────┘
    """
    if isinstance(y, str):
        y = F.col(y)
    if isinstance(x, str):
        x = F.col(x)
    if not hasattr(x, "_pyexpr"):
        msg = f"`arctan2` expected a `str` or `Expr`, got a `{qualified_type_name(x)}`"
        raise TypeError(msg)
    if not hasattr(y, "_pyexpr"):
        msg = f"`arctan2` expected a `str` or `Expr`, got a `{qualified_type_name(y)}`"
        raise TypeError(msg)

    return wrap_expr(plr.arctan2(y._pyexpr, x._pyexpr))


@deprecated("`arctan2d` is deprecated; use `arctan2` followed by `.degrees()` instead.")
def arctan2d(y: str | Expr, x: str | Expr) -> Expr:
    """
    Compute the two-argument arctangent in degrees.

    .. deprecated:: 1.0.0
        Use `arctan2` followed by :meth:`Expr.degrees` instead.

    Returns the angle (in degrees) in the plane between the positive x-axis
    and the ray from the origin to (x,y).

    Parameters
    ----------
    y
        Column name or Expression.
    x
        Column name or Expression.

    Examples
    --------
    >>> c = (2**0.5) / 2
    >>> df = pl.DataFrame(
    ...     {
    ...         "y": [c, -c, c, -c],
    ...         "x": [c, c, -c, -c],
    ...     }
    ... )
    >>> df.select(  # doctest: +SKIP
    ...     pl.arctan2d("y", "x").alias("atan2d"),
    ...     pl.arctan2("y", "x").alias("atan2"),
    ... )
    shape: (4, 2)
    ┌────────┬───────────┐
    │ atan2d ┆ atan2     │
    │ ---    ┆ ---       │
    │ f64    ┆ f64       │
    ╞════════╪═══════════╡
    │ 45.0   ┆ 0.785398  │
    │ -45.0  ┆ -0.785398 │
    │ 135.0  ┆ 2.356194  │
    │ -135.0 ┆ -2.356194 │
    └────────┴───────────┘
    """
    return arctan2(y, x).degrees()


def exclude(
    columns: str | PolarsDataType | Collection[str] | Collection[PolarsDataType],
    *more_columns: str | PolarsDataType,
) -> Expr:
    """
    Represent all columns except for the given columns.

    Syntactic sugar for `pl.all().exclude(columns)`.

    Parameters
    ----------
    columns
        The name or datatype of the column(s) to exclude. Accepts regular expression
        input. Regular expressions should start with `^` and end with `$`.
    *more_columns
        Additional names or datatypes of columns to exclude, specified as positional
        arguments.

    Examples
    --------
    Exclude by column name(s):

    >>> df = pl.DataFrame(
    ...     {
    ...         "aa": [1, 2, 3],
    ...         "ba": ["a", "b", None],
    ...         "cc": [None, 2.5, 1.5],
    ...     }
    ... )
    >>> df.select(pl.exclude("ba"))
    shape: (3, 2)
    ┌─────┬──────┐
    │ aa  ┆ cc   │
    │ --- ┆ ---  │
    │ i64 ┆ f64  │
    ╞═════╪══════╡
    │ 1   ┆ null │
    │ 2   ┆ 2.5  │
    │ 3   ┆ 1.5  │
    └─────┴──────┘

    Exclude by regex, e.g. removing all columns whose names end with the letter "a":

    >>> df.select(pl.exclude("^.*a$"))
    shape: (3, 1)
    ┌──────┐
    │ cc   │
    │ ---  │
    │ f64  │
    ╞══════╡
    │ null │
    │ 2.5  │
    │ 1.5  │
    └──────┘

    Exclude by dtype(s), e.g. removing all columns of type Int64 or Float64:

    >>> df.select(pl.exclude([pl.Int64, pl.Float64]))
    shape: (3, 1)
    ┌──────┐
    │ ba   │
    │ ---  │
    │ str  │
    ╞══════╡
    │ a    │
    │ b    │
    │ null │
    └──────┘
    """
    return F.col("*").exclude(columns, *more_columns)


def groups(column: str) -> Expr:
    """Syntactic sugar for `pl.col("foo").agg_groups()`."""
    return F.col(column).agg_groups()

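# A minimal illustrative sketch of `groups` usage (hypothetical data; output
# omitted):
#
# >>> df = pl.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})
# >>> df.group_by("g").agg(pl.groups("v"))  # doctest: +SKIP
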
def quantile(
    column: str,
    quantile: float | Expr,
    interpolation: QuantileMethod = "nearest",
) -> Expr:
    """
    Syntactic sugar for `pl.col("foo").quantile(..)`.

    Parameters
    ----------
    column
        Column name.
    quantile
        Quantile between 0.0 and 1.0.
    interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
        Interpolation method.
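
    Examples
    --------
    A minimal illustrative sketch (hypothetical data; output omitted):

    >>> df = pl.DataFrame({"a": [1, 2, 3, 4]})
    >>> df.select(pl.quantile("a", 0.5))  # doctest: +SKIP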
    """
    return F.col(column).quantile(quantile, interpolation)


def arg_sort_by(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    descending: bool | Sequence[bool] = False,
    nulls_last: bool | Sequence[bool] = False,
    multithreaded: bool = True,
    maintain_order: bool = False,
) -> Expr:
    """
    Return the row indices that would sort the column(s).

    Parameters
    ----------
    exprs
        Column(s) to arg sort by. Accepts expression input. Strings are parsed as column
        names.
    *more_exprs
        Additional columns to arg sort by, specified as positional arguments.
    descending
        Sort in descending order. When sorting by multiple columns, can be specified
        per column by passing a sequence of booleans.
    nulls_last
        Place null values last.
    multithreaded
        Sort using multiple threads.
    maintain_order
        Whether the order should be maintained if elements are equal.

    See Also
    --------
    Expr.gather : Take values by index.
    Expr.rank : Get the rank of each row.

    Examples
    --------
    Pass a single column name to compute the arg sort by that column.

    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [0, 1, 1, 0],
    ...         "b": [3, 2, 3, 2],
    ...         "c": [1, 2, 3, 4],
    ...     }
    ... )
    >>> df.select(pl.arg_sort_by("a"))
    shape: (4, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 0   │
    │ 3   │
    │ 1   │
    │ 2   │
    └─────┘

    Compute the arg sort by multiple columns by either passing a list of columns, or by
    specifying each column as a positional argument.

    >>> df.select(pl.arg_sort_by(["a", "b"], descending=True))
    shape: (4, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ u32 │
    ╞═════╡
    │ 2   │
    │ 1   │
    │ 0   │
    │ 3   │
    └─────┘

    Use gather to apply the arg sort to other columns.

    >>> df.select(pl.col("c").gather(pl.arg_sort_by("a")))
    shape: (4, 1)
    ┌─────┐
    │ c   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 4   │
    │ 2   │
    │ 3   │
    └─────┘
    """
    exprs = parse_into_list_of_expressions(exprs, *more_exprs)
    descending = extend_bool(descending, len(exprs), "descending", "exprs")
    nulls_last = extend_bool(nulls_last, len(exprs), "nulls_last", "exprs")
    return wrap_expr(
        plr.arg_sort_by(exprs, descending, nulls_last, multithreaded, maintain_order)
    )


@deprecate_streaming_parameter()
@forward_old_opt_flags()
def collect_all(
    lazy_frames: Iterable[LazyFrame],
    *,
    type_coercion: bool = True,
    predicate_pushdown: bool = True,
    projection_pushdown: bool = True,
    simplify_expression: bool = True,
    no_optimization: bool = False,
    slice_pushdown: bool = True,
    comm_subplan_elim: bool = True,
    comm_subexpr_elim: bool = True,
    cluster_with_columns: bool = True,
    collapse_joins: bool = True,
    optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS,
    engine: EngineType = "auto",
) -> list[DataFrame]:
    """
    Collect multiple LazyFrames at the same time.

    This can run all the computation graphs in parallel or combined.

    Common Subplan Elimination is applied on the combined plan, meaning
    that diverging queries will run only once.

    Parameters
    ----------
    lazy_frames
        A list of LazyFrames to collect.
    type_coercion
        Do type coercion optimization.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    predicate_pushdown
        Do predicate pushdown optimization.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    projection_pushdown
        Do projection pushdown optimization.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    simplify_expression
        Run simplify expressions optimization.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    no_optimization
        Turn off optimizations.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    slice_pushdown
        Slice pushdown optimization.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    comm_subplan_elim
        Will try to cache branching subplans that occur on self-joins or unions.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    comm_subexpr_elim
        Common subexpressions will be cached and reused.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    cluster_with_columns
        Combine sequential independent calls to with_columns.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    collapse_joins
        Collapse a join and filters into a faster join.

        .. deprecated:: 1.30.0
            Use the `optimizations` parameter.
    optimizations
        The optimization passes done during query optimization.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.
    engine
        Select the engine used to process the query, optional.
        At the moment, if set to `"auto"` (default), the query
        is run using the polars in-memory engine. Polars will also
        attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
        environment variable. If it cannot run the query using the
        selected engine, the query is run using the polars in-memory
        engine.

        .. note::
            The GPU engine does not support async, or running in the
            background. If either is enabled, then GPU execution is switched off.

    Returns
    -------
    list of DataFrames
        The collected DataFrames, returned in the same order as the input LazyFrames.
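
    Examples
    --------
    A minimal illustrative sketch (hypothetical data; output omitted):

    >>> lf1 = pl.LazyFrame({"a": [1, 2]}).select(pl.col("a").sum())
    >>> lf2 = pl.LazyFrame({"b": [3, 4]}).select(pl.col("b").mean())
    >>> pl.collect_all([lf1, lf2])  # doctest: +SKIP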
    """
    if engine == "streaming":
        issue_unstable_warning("streaming mode is considered unstable.")

    lfs = [lf._ldf for lf in lazy_frames]
    out = plr.collect_all(lfs, engine, optimizations._pyoptflags)

    # wrap the PyDataFrames into DataFrames
    result = [wrap_df(pydf) for pydf in out]

    return result


@overload
def collect_all_async(
    lazy_frames: Iterable[LazyFrame],
    *,
    gevent: Literal[True],
    engine: EngineType = "auto",
    optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS,
) -> _GeventDataFrameResult[list[DataFrame]]: ...


@overload
def collect_all_async(
    lazy_frames: Iterable[LazyFrame],
    *,
    gevent: Literal[False] = False,
    engine: EngineType = "auto",
    optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS,
) -> Awaitable[list[DataFrame]]: ...


@unstable()
@deprecate_streaming_parameter()
def collect_all_async(
    lazy_frames: Iterable[LazyFrame],
    *,
    gevent: bool = False,
    engine: EngineType = "auto",
    optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS,
) -> Awaitable[list[DataFrame]] | _GeventDataFrameResult[list[DataFrame]]:
    """
    Collect multiple LazyFrames at the same time asynchronously in a thread pool.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    Collects into a list of DataFrame (like :func:`polars.collect_all`),
    but instead of returning them directly, they are scheduled to be collected
    inside a thread pool, while this method returns almost instantly.

    May be useful if you use gevent or asyncio and want to release control to other
    greenlets/tasks while LazyFrames are being collected.

    Parameters
    ----------
    lazy_frames
        A list of LazyFrames to collect.
    gevent
        Return a wrapper around `gevent.event.AsyncResult` instead of an Awaitable.
    optimizations
        The optimization passes done during query optimization.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.
    engine
        Select the engine used to process the query, optional.
        At the moment, if set to `"auto"` (default), the query
        is run using the polars in-memory engine. Polars will also
        attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
        environment variable. If it cannot run the query using the
        selected engine, the query is run using the polars in-memory
        engine.

        .. note::
            The GPU engine does not support async, or running in the
            background. If either is enabled, then GPU execution is switched off.

    See Also
    --------
    polars.collect_all : Collect multiple LazyFrames at the same time.
    LazyFrame.collect_async : Collect a single frame.

    Notes
    -----
    In case of an error, `set_exception` is used on the
    `asyncio.Future`/`gevent.event.AsyncResult`, and the error will be re-raised there.

    Returns
    -------
    If `gevent=False` (default), returns an awaitable.

    If `gevent=True`, returns a wrapper that has a
    `.get(block=True, timeout=None)` method.
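
    Examples
    --------
    A minimal illustrative sketch (hypothetical data; assumes a running asyncio
    event loop; output omitted):

    >>> import asyncio
    >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
    >>> async def main():
    ...     return await pl.collect_all_async([lf])
    >>> asyncio.run(main())  # doctest: +SKIP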
    """
    if engine == "streaming":
        issue_unstable_warning("streaming mode is considered unstable.")

    result: (
        _GeventDataFrameResult[list[DataFrame]] | _AioDataFrameResult[list[DataFrame]]
    ) = _GeventDataFrameResult() if gevent else _AioDataFrameResult()
    lfs = [lf._ldf for lf in lazy_frames]
    plr.collect_all_with_callback(
        lfs, engine, optimizations._pyoptflags, result._callback_all
    )
    return result


@unstable()
def explain_all(
    lazy_frames: Iterable[LazyFrame],
    *,
    optimizations: QueryOptFlags = DEFAULT_QUERY_OPT_FLAGS,
) -> str:
    """
    Explain multiple LazyFrames as if passed to `collect_all`.

    Common Subplan Elimination is applied on the combined plan, meaning
    that diverging queries will run only once.

    Parameters
    ----------
    lazy_frames
        A list of LazyFrames to collect.
    optimizations
        The optimization passes done during query optimization.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.

    Returns
    -------
    Explained plan.
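
    Examples
    --------
    A minimal illustrative sketch (hypothetical data; output omitted):

    >>> lf = pl.LazyFrame({"a": [1, 2, 3]})
    >>> print(pl.explain_all([lf.select(pl.col("a").sum())]))  # doctest: +SKIP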
    """
    lfs = [lf._ldf for lf in lazy_frames]
    return plr.explain_all(lfs, optimizations._pyoptflags)


@overload
def select(
    *exprs: IntoExpr | Iterable[IntoExpr],
    eager: Literal[True] = ...,
    **named_exprs: IntoExpr,
) -> DataFrame: ...


@overload
def select(
    *exprs: IntoExpr | Iterable[IntoExpr],
    eager: Literal[False],
    **named_exprs: IntoExpr,
) -> LazyFrame: ...


def select(
    *exprs: IntoExpr | Iterable[IntoExpr], eager: bool = True, **named_exprs: IntoExpr
) -> DataFrame | LazyFrame:
    """
    Run polars expressions without a context.

    This is syntactic sugar for running `df.select` on an empty DataFrame
    (or LazyFrame if eager=False).

    Parameters
    ----------
    *exprs
        Column(s) to select, specified as positional arguments.
        Accepts expression input. Strings are parsed as column names,
        other non-expression inputs are parsed as literals.
    eager
        Evaluate immediately and return a `DataFrame` (default); if set to `False`,
        return a `LazyFrame` instead.
    **named_exprs
        Additional columns to select, specified as keyword arguments.
        The columns will be renamed to the keyword used.

    Returns
    -------
    DataFrame or LazyFrame

    Examples
    --------
    >>> foo = pl.Series("foo", [1, 2, 3])
    >>> bar = pl.Series("bar", [3, 2, 1])
    >>> pl.select(min=pl.min_horizontal(foo, bar))
    shape: (3, 1)
    ┌─────┐
    │ min │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 1   │
    │ 2   │
    │ 1   │
    └─────┘

    >>> pl.select(pl.int_range(0, 100_000, 2).alias("n"), eager=False).filter(
    ...     pl.col("n") % 22_500 == 0
    ... ).collect()
    shape: (5, 1)
    ┌───────┐
    │ n     │
    │ ---   │
    │ i64   │
    ╞═══════╡
    │ 0     │
    │ 22500 │
    │ 45000 │
    │ 67500 │
    │ 90000 │
    └───────┘
    """
    empty_frame = pl.DataFrame() if eager else pl.LazyFrame()
    return empty_frame.select(*exprs, **named_exprs)


@overload
def arg_where(condition: Expr | Series, *, eager: Literal[False] = ...) -> Expr: ...


@overload
def arg_where(condition: Expr | Series, *, eager: Literal[True]) -> Series: ...


def arg_where(condition: Expr | Series, *, eager: bool = False) -> Expr | Series:
    """
    Return indices where `condition` evaluates to `True`.

    Parameters
    ----------
    condition
        Boolean expression to evaluate.
    eager
        Evaluate immediately and return a `Series`; this requires that the given
        condition is itself a `Series`. If set to `False` (default), return
        an expression instead.

    See Also
    --------
    Series.arg_true : Return indices where Series is True.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    >>> df.select(
    ...     [
    ...         pl.arg_where(pl.col("a") % 2 == 0),
    ...     ]
    ... ).to_series()
    shape: (2,)
    Series: 'a' [u32]
    [
        1
        3
    ]
    """
    if eager:
        if not isinstance(condition, pl.Series):
            msg = (
                "expected Series in 'arg_where' if 'eager=True', got"
                f" {type(condition).__name__!r}"
            )
            raise ValueError(msg)
        return condition.to_frame().select(arg_where(F.col(condition.name))).to_series()
    else:
        condition_pyexpr = parse_into_expression(condition)
        return wrap_expr(plr.arg_where(condition_pyexpr))


@overload
def coalesce(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    eager: Literal[False] = ...,
) -> Expr: ...


@overload
def coalesce(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    eager: Literal[True],
) -> Series: ...


@overload
def coalesce(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    eager: bool,
) -> Expr | Series: ...


def coalesce(
    exprs: IntoExpr | Iterable[IntoExpr],
    *more_exprs: IntoExpr,
    eager: bool = False,
) -> Expr | Series:
    """
    Fold the columns from left to right, keeping the first non-null value.

    Parameters
    ----------
    exprs
        Columns to coalesce. Accepts expression input. Strings are parsed as column
        names, other non-expression inputs are parsed as literals.
    *more_exprs
        Additional columns to coalesce, specified as positional arguments.
    eager
        Evaluate immediately and return a `Series`; this requires that at least one
        of the given arguments is a `Series`. If set to `False` (default), return
        an expression instead.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, None, None, None],
    ...         "b": [1, 2, None, None],
    ...         "c": [5, None, 3, None],
    ...     }
    ... )

    >>> df.with_columns(pl.coalesce("a", "b", "c", 10).alias("d"))
    shape: (4, 4)
    ┌──────┬──────┬──────┬─────┐
    │ a    ┆ b    ┆ c    ┆ d   │
    │ ---  ┆ ---  ┆ ---  ┆ --- │
    │ i64  ┆ i64  ┆ i64  ┆ i64 │
    ╞══════╪══════╪══════╪═════╡
    │ 1    ┆ 1    ┆ 5    ┆ 1   │
    │ null ┆ 2    ┆ null ┆ 2   │
    │ null ┆ null ┆ 3    ┆ 3   │
    │ null ┆ null ┆ null ┆ 10  │
    └──────┴──────┴──────┴─────┘

    >>> df.with_columns(pl.coalesce(pl.col(["a", "b", "c"]), 10.0).alias("d"))
    shape: (4, 4)
    ┌──────┬──────┬──────┬──────┐
    │ a    ┆ b    ┆ c    ┆ d    │
    │ ---  ┆ ---  ┆ ---  ┆ ---  │
    │ i64  ┆ i64  ┆ i64  ┆ f64  │
    ╞══════╪══════╪══════╪══════╡
    │ 1    ┆ 1    ┆ 5    ┆ 1.0  │
    │ null ┆ 2    ┆ null ┆ 2.0  │
    │ null ┆ null ┆ 3    ┆ 3.0  │
    │ null ┆ null ┆ null ┆ 10.0 │
    └──────┴──────┴──────┴──────┘

    >>> s1 = pl.Series("a", [None, 2, None])
    >>> s2 = pl.Series("b", [1, None, 3])
    >>> pl.coalesce(s1, s2, eager=True)
    shape: (3,)
    Series: 'a' [i64]
    [
        1
        2
        3
    ]
    """
    if eager:
        exprs = [exprs, *more_exprs]
        if not (series := [e for e in exprs if isinstance(e, pl.Series)]):
            msg = "expected at least one Series in 'coalesce' if 'eager=True'"
            raise ValueError(msg)

        exprs = [(e.name if isinstance(e, pl.Series) else e) for e in exprs]
        return pl.DataFrame(series).select(coalesce(exprs, eager=False)).to_series()
    else:
        exprs = parse_into_list_of_expressions(exprs, *more_exprs)
        return wrap_expr(plr.coalesce(exprs))


@overload
def from_epoch(column: str | Expr, time_unit: EpochTimeUnit = ...) -> Expr: ...


@overload
def from_epoch(
    column: Series | Sequence[int], time_unit: EpochTimeUnit = ...
) -> Series: ...


def from_epoch(
    column: str | Expr | Series | Sequence[int], time_unit: EpochTimeUnit = "s"
) -> Expr | Series:
    """
    Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).

    Depending on the `time_unit` provided, this function will return a different dtype:

    - time_unit="d" returns pl.Date
    - time_unit="s" returns pl.Datetime["us"] (pl.Datetime's default)
    - time_unit="ms" returns pl.Datetime["ms"]
    - time_unit="us" returns pl.Datetime["us"]
    - time_unit="ns" returns pl.Datetime["ns"]

    Parameters
    ----------
    column
        Series or expression to parse integers to pl.Datetime.
    time_unit
        The unit of time of the timesteps since epoch time.

    Examples
    --------
    >>> df = pl.DataFrame({"timestamp": [1666683077, 1666683099]}).lazy()
    >>> df.select(pl.from_epoch(pl.col("timestamp"), time_unit="s")).collect()
    shape: (2, 1)
    ┌─────────────────────┐
    │ timestamp           │
    │ ---                 │
    │ datetime[μs]        │
    ╞═════════════════════╡
    │ 2022-10-25 07:31:17 │
    │ 2022-10-25 07:31:39 │
    └─────────────────────┘

    The function can also be used in an eager context by passing a Series.

    >>> s = pl.Series([12345, 12346])
    >>> pl.from_epoch(s, time_unit="d")
    shape: (2,)
    Series: '' [date]
    [
        2003-10-20
        2003-10-21
    ]
    """
    if isinstance(column, str):
        column = F.col(column)
    elif not isinstance(column, (pl.Series, pl.Expr)):
        column = pl.Series(column)  # Sequence input handled by Series constructor

    if time_unit == "d":
        return column.cast(Date)
    elif time_unit == "s":
        return (column.cast(Int64) * 1_000_000).cast(Datetime("us"))
    elif time_unit in DTYPE_TEMPORAL_UNITS:
        return column.cast(Datetime(time_unit))
    else:
        msg = f"`time_unit` must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got {time_unit!r}"
        raise ValueError(msg)


@deprecate_renamed_parameter("min_periods", "min_samples", version="1.21.0")
def rolling_cov(
    a: str | Expr,
    b: str | Expr,
    *,
    window_size: int,
    min_samples: int | None = None,
    ddof: int = 1,
) -> Expr:
    """
    Compute the rolling covariance between two columns / expressions.

    The window at a given row includes the row itself and the
    `window_size - 1` elements before it.

    .. versionchanged:: 1.21.0
        The `min_periods` parameter was renamed `min_samples`.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    window_size
        The length of the window.
    min_samples
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to `window_size`.
    ddof
        Delta degrees of freedom. The divisor used in calculations
        is `N - ddof`, where `N` represents the number of elements.
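
    Examples
    --------
    A minimal illustrative sketch (hypothetical data; output omitted):

    >>> df = pl.DataFrame({"a": [1.0, 2.0, 4.0, 7.0], "b": [1.0, 3.0, 2.0, 6.0]})
    >>> df.select(pl.rolling_cov("a", "b", window_size=2))  # doctest: +SKIP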
    """
    if min_samples is None:
        min_samples = window_size
    if isinstance(a, str):
        a = F.col(a)
    if isinstance(b, str):
        b = F.col(b)
    return wrap_expr(
        plr.rolling_cov(a._pyexpr, b._pyexpr, window_size, min_samples, ddof)
    )


@deprecate_renamed_parameter("min_periods", "min_samples", version="1.21.0")
def rolling_corr(
    a: str | Expr,
    b: str | Expr,
    *,
    window_size: int,
    min_samples: int | None = None,
    ddof: int = 1,
) -> Expr:
    """
    Compute the rolling correlation between two columns / expressions.

    The window at a given row includes the row itself and the
    `window_size - 1` elements before it.

    .. versionchanged:: 1.21.0
        The `min_periods` parameter was renamed `min_samples`.

    Parameters
    ----------
    a
        Column name or Expression.
    b
        Column name or Expression.
    window_size
        The length of the window.
    min_samples
        The number of values in the window that should be non-null before computing
        a result. If None, it will be set equal to `window_size`.
    ddof
        Delta degrees of freedom. The divisor used in calculations
        is `N - ddof`, where `N` represents the number of elements.
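
    Examples
    --------
    A minimal illustrative sketch (hypothetical data; output omitted):

    >>> df = pl.DataFrame({"a": [1.0, 2.0, 4.0, 7.0], "b": [1.0, 3.0, 2.0, 6.0]})
    >>> df.select(pl.rolling_corr("a", "b", window_size=2))  # doctest: +SKIP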
    """
    if min_samples is None:
        min_samples = window_size
    if isinstance(a, str):
        a = F.col(a)
    if isinstance(b, str):
        b = F.col(b)
    return wrap_expr(
        plr.rolling_corr(a._pyexpr, b._pyexpr, window_size, min_samples, ddof)
    )


@overload
def sql_expr(sql: str) -> Expr:  # type: ignore[overload-overlap]
    ...


@overload
def sql_expr(sql: Sequence[str]) -> list[Expr]: ...


def sql_expr(sql: str | Sequence[str]) -> Expr | list[Expr]:
    """
    Parse one or more SQL expressions to Polars expression(s).

    Parameters
    ----------
    sql
        One or more SQL expressions.

    Examples
    --------
    Parse a single SQL expression:

    >>> df = pl.DataFrame({"a": [2, 1]})
    >>> expr = pl.sql_expr("MAX(a)")
    >>> df.select(expr)
    shape: (1, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ i64 │
    ╞═════╡
    │ 2   │
    └─────┘

    Parse multiple SQL expressions:

    >>> df.with_columns(
    ...     *pl.sql_expr(["POWER(a,a) AS a_a", "CAST(a AS TEXT) AS a_txt"]),
    ... )
    shape: (2, 3)
    ┌─────┬─────┬───────┐
    │ a   ┆ a_a ┆ a_txt │
    │ --- ┆ --- ┆ ---   │
    │ i64 ┆ i64 ┆ str   │
    ╞═════╪═════╪═══════╡
    │ 2   ┆ 4   ┆ 2     │
    │ 1   ┆ 1   ┆ 1     │
    └─────┴─────┴───────┘
    """
    if isinstance(sql, str):
        return wrap_expr(plr.sql_expr(sql))
    else:
        return [wrap_expr(plr.sql_expr(q)) for q in sql]


@unstable()
def row_index(name: str = "index") -> pl.Expr:
    """
    Generate a sequence of integers.

    The length of the returned sequence will match the context length, and the
    datatype will match the one returned by `get_index_type()`.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

    If you would like to generate sequences with custom offsets / length /
    step size / datatypes, it is recommended to use `int_range` instead.

    Parameters
    ----------
    name
        Name of the returned column.

    Returns
    -------
    Expr
        Column of integers.

    See Also
    --------
    int_range : Generate a range of integers.

    Examples
    --------
    >>> df = pl.DataFrame({"x": ["A", "A", "B", "B", "B"]})
    >>> df.with_columns(pl.row_index(), pl.row_index("another_index"))
    shape: (5, 3)
    ┌─────┬───────┬───────────────┐
    │ x   ┆ index ┆ another_index │
    │ --- ┆ ---   ┆ ---           │
    │ str ┆ u32   ┆ u32           │
    ╞═════╪═══════╪═══════════════╡
    │ A   ┆ 0     ┆ 0             │
    │ A   ┆ 1     ┆ 1             │
    │ B   ┆ 2     ┆ 2             │
    │ B   ┆ 3     ┆ 3             │
    │ B   ┆ 4     ┆ 4             │
    └─────┴───────┴───────────────┘
    >>> df.group_by("x").agg(pl.row_index()).sort("x")
    shape: (2, 2)
    ┌─────┬───────────┐
    │ x   ┆ index     │
    │ --- ┆ ---       │
    │ str ┆ list[u32] │
    ╞═════╪═══════════╡
    │ A   ┆ [0, 1]    │
    │ B   ┆ [0, 1, 2] │
    └─────┴───────────┘
    >>> df.select(pl.row_index())
    shape: (5, 1)
    ┌───────┐
    │ index │
    │ ---   │
    │ u32   │
    ╞═══════╡
    │ 0     │
    │ 1     │
    │ 2     │
    │ 3     │
    │ 4     │
    └───────┘
    """
    # Notes
    # * Dispatching to `int_range` means that we cannot accept an offset
    #   parameter, as unlike `DataFrame.with_row_index()`, `int_range` will simply
    #   truncate instead of raising an error.
    return F.int_range(
        F.len(),
        dtype=get_index_type(),
    ).alias(name)