polars-runtime-compat 1.34.0b3__cp39-abi3-win_amd64.whl → 1.34.0b4__cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic. Click here for more details.
- _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
- polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
- polars/__init__.py +0 -528
- polars/_cpu_check.py +0 -265
- polars/_dependencies.py +0 -355
- polars/_plr.py +0 -99
- polars/_plr.pyi +0 -2496
- polars/_reexport.py +0 -23
- polars/_typing.py +0 -478
- polars/_utils/__init__.py +0 -37
- polars/_utils/async_.py +0 -102
- polars/_utils/cache.py +0 -176
- polars/_utils/cloud.py +0 -40
- polars/_utils/constants.py +0 -29
- polars/_utils/construction/__init__.py +0 -46
- polars/_utils/construction/dataframe.py +0 -1397
- polars/_utils/construction/other.py +0 -72
- polars/_utils/construction/series.py +0 -560
- polars/_utils/construction/utils.py +0 -118
- polars/_utils/convert.py +0 -224
- polars/_utils/deprecation.py +0 -406
- polars/_utils/getitem.py +0 -457
- polars/_utils/logging.py +0 -11
- polars/_utils/nest_asyncio.py +0 -264
- polars/_utils/parquet.py +0 -15
- polars/_utils/parse/__init__.py +0 -12
- polars/_utils/parse/expr.py +0 -242
- polars/_utils/polars_version.py +0 -19
- polars/_utils/pycapsule.py +0 -53
- polars/_utils/scan.py +0 -27
- polars/_utils/serde.py +0 -63
- polars/_utils/slice.py +0 -215
- polars/_utils/udfs.py +0 -1251
- polars/_utils/unstable.py +0 -63
- polars/_utils/various.py +0 -782
- polars/_utils/wrap.py +0 -25
- polars/api.py +0 -370
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +0 -19
- polars/catalog/unity/client.py +0 -733
- polars/catalog/unity/models.py +0 -152
- polars/config.py +0 -1571
- polars/convert/__init__.py +0 -25
- polars/convert/general.py +0 -1046
- polars/convert/normalize.py +0 -261
- polars/dataframe/__init__.py +0 -5
- polars/dataframe/_html.py +0 -186
- polars/dataframe/frame.py +0 -12582
- polars/dataframe/group_by.py +0 -1067
- polars/dataframe/plotting.py +0 -257
- polars/datatype_expr/__init__.py +0 -5
- polars/datatype_expr/array.py +0 -56
- polars/datatype_expr/datatype_expr.py +0 -304
- polars/datatype_expr/list.py +0 -18
- polars/datatype_expr/struct.py +0 -69
- polars/datatypes/__init__.py +0 -122
- polars/datatypes/_parse.py +0 -195
- polars/datatypes/_utils.py +0 -48
- polars/datatypes/classes.py +0 -1213
- polars/datatypes/constants.py +0 -11
- polars/datatypes/constructor.py +0 -172
- polars/datatypes/convert.py +0 -366
- polars/datatypes/group.py +0 -130
- polars/exceptions.py +0 -230
- polars/expr/__init__.py +0 -7
- polars/expr/array.py +0 -964
- polars/expr/binary.py +0 -346
- polars/expr/categorical.py +0 -306
- polars/expr/datetime.py +0 -2620
- polars/expr/expr.py +0 -11272
- polars/expr/list.py +0 -1408
- polars/expr/meta.py +0 -444
- polars/expr/name.py +0 -321
- polars/expr/string.py +0 -3045
- polars/expr/struct.py +0 -357
- polars/expr/whenthen.py +0 -185
- polars/functions/__init__.py +0 -193
- polars/functions/aggregation/__init__.py +0 -33
- polars/functions/aggregation/horizontal.py +0 -298
- polars/functions/aggregation/vertical.py +0 -341
- polars/functions/as_datatype.py +0 -848
- polars/functions/business.py +0 -138
- polars/functions/col.py +0 -384
- polars/functions/datatype.py +0 -121
- polars/functions/eager.py +0 -524
- polars/functions/escape_regex.py +0 -29
- polars/functions/lazy.py +0 -2751
- polars/functions/len.py +0 -68
- polars/functions/lit.py +0 -210
- polars/functions/random.py +0 -22
- polars/functions/range/__init__.py +0 -19
- polars/functions/range/_utils.py +0 -15
- polars/functions/range/date_range.py +0 -303
- polars/functions/range/datetime_range.py +0 -370
- polars/functions/range/int_range.py +0 -348
- polars/functions/range/linear_space.py +0 -311
- polars/functions/range/time_range.py +0 -287
- polars/functions/repeat.py +0 -301
- polars/functions/whenthen.py +0 -353
- polars/interchange/__init__.py +0 -10
- polars/interchange/buffer.py +0 -77
- polars/interchange/column.py +0 -190
- polars/interchange/dataframe.py +0 -230
- polars/interchange/from_dataframe.py +0 -328
- polars/interchange/protocol.py +0 -303
- polars/interchange/utils.py +0 -170
- polars/io/__init__.py +0 -64
- polars/io/_utils.py +0 -317
- polars/io/avro.py +0 -49
- polars/io/clipboard.py +0 -36
- polars/io/cloud/__init__.py +0 -17
- polars/io/cloud/_utils.py +0 -80
- polars/io/cloud/credential_provider/__init__.py +0 -17
- polars/io/cloud/credential_provider/_builder.py +0 -520
- polars/io/cloud/credential_provider/_providers.py +0 -618
- polars/io/csv/__init__.py +0 -9
- polars/io/csv/_utils.py +0 -38
- polars/io/csv/batched_reader.py +0 -142
- polars/io/csv/functions.py +0 -1495
- polars/io/database/__init__.py +0 -6
- polars/io/database/_arrow_registry.py +0 -70
- polars/io/database/_cursor_proxies.py +0 -147
- polars/io/database/_executor.py +0 -578
- polars/io/database/_inference.py +0 -314
- polars/io/database/_utils.py +0 -144
- polars/io/database/functions.py +0 -516
- polars/io/delta.py +0 -499
- polars/io/iceberg/__init__.py +0 -3
- polars/io/iceberg/_utils.py +0 -697
- polars/io/iceberg/dataset.py +0 -556
- polars/io/iceberg/functions.py +0 -151
- polars/io/ipc/__init__.py +0 -8
- polars/io/ipc/functions.py +0 -514
- polars/io/json/__init__.py +0 -3
- polars/io/json/read.py +0 -101
- polars/io/ndjson.py +0 -332
- polars/io/parquet/__init__.py +0 -17
- polars/io/parquet/field_overwrites.py +0 -140
- polars/io/parquet/functions.py +0 -722
- polars/io/partition.py +0 -491
- polars/io/plugins.py +0 -187
- polars/io/pyarrow_dataset/__init__.py +0 -5
- polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
- polars/io/pyarrow_dataset/functions.py +0 -79
- polars/io/scan_options/__init__.py +0 -5
- polars/io/scan_options/_options.py +0 -59
- polars/io/scan_options/cast_options.py +0 -126
- polars/io/spreadsheet/__init__.py +0 -6
- polars/io/spreadsheet/_utils.py +0 -52
- polars/io/spreadsheet/_write_utils.py +0 -647
- polars/io/spreadsheet/functions.py +0 -1323
- polars/lazyframe/__init__.py +0 -9
- polars/lazyframe/engine_config.py +0 -61
- polars/lazyframe/frame.py +0 -8564
- polars/lazyframe/group_by.py +0 -669
- polars/lazyframe/in_process.py +0 -42
- polars/lazyframe/opt_flags.py +0 -333
- polars/meta/__init__.py +0 -14
- polars/meta/build.py +0 -33
- polars/meta/index_type.py +0 -27
- polars/meta/thread_pool.py +0 -50
- polars/meta/versions.py +0 -120
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +0 -213
- polars/ml/utilities.py +0 -30
- polars/plugins.py +0 -155
- polars/py.typed +0 -0
- polars/pyproject.toml +0 -103
- polars/schema.py +0 -265
- polars/selectors.py +0 -3117
- polars/series/__init__.py +0 -5
- polars/series/array.py +0 -776
- polars/series/binary.py +0 -254
- polars/series/categorical.py +0 -246
- polars/series/datetime.py +0 -2275
- polars/series/list.py +0 -1087
- polars/series/plotting.py +0 -191
- polars/series/series.py +0 -9197
- polars/series/string.py +0 -2367
- polars/series/struct.py +0 -154
- polars/series/utils.py +0 -191
- polars/sql/__init__.py +0 -7
- polars/sql/context.py +0 -677
- polars/sql/functions.py +0 -139
- polars/string_cache.py +0 -185
- polars/testing/__init__.py +0 -13
- polars/testing/asserts/__init__.py +0 -9
- polars/testing/asserts/frame.py +0 -231
- polars/testing/asserts/series.py +0 -219
- polars/testing/asserts/utils.py +0 -12
- polars/testing/parametric/__init__.py +0 -33
- polars/testing/parametric/profiles.py +0 -107
- polars/testing/parametric/strategies/__init__.py +0 -22
- polars/testing/parametric/strategies/_utils.py +0 -14
- polars/testing/parametric/strategies/core.py +0 -615
- polars/testing/parametric/strategies/data.py +0 -452
- polars/testing/parametric/strategies/dtype.py +0 -436
- polars/testing/parametric/strategies/legacy.py +0 -169
- polars/type_aliases.py +0 -24
- polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
- {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/expr/string.py
DELETED
|
@@ -1,3045 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import warnings
|
|
4
|
-
from collections.abc import Mapping
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
6
|
-
|
|
7
|
-
import polars._reexport as pl
|
|
8
|
-
from polars import functions as F
|
|
9
|
-
from polars._utils.deprecation import deprecate_nonkeyword_arguments, deprecated
|
|
10
|
-
from polars._utils.parse import parse_into_expression
|
|
11
|
-
from polars._utils.unstable import unstable
|
|
12
|
-
from polars._utils.various import (
|
|
13
|
-
find_stacklevel,
|
|
14
|
-
issue_warning,
|
|
15
|
-
no_default,
|
|
16
|
-
qualified_type_name,
|
|
17
|
-
)
|
|
18
|
-
from polars._utils.wrap import wrap_expr
|
|
19
|
-
from polars.datatypes import Date, Datetime, Int64, Time, parse_into_datatype_expr
|
|
20
|
-
from polars.exceptions import ChronoFormatWarning
|
|
21
|
-
|
|
22
|
-
if TYPE_CHECKING:
|
|
23
|
-
import sys
|
|
24
|
-
|
|
25
|
-
from polars import Expr
|
|
26
|
-
from polars._typing import (
|
|
27
|
-
Ambiguous,
|
|
28
|
-
IntoExpr,
|
|
29
|
-
IntoExprColumn,
|
|
30
|
-
PolarsDataType,
|
|
31
|
-
PolarsIntegerType,
|
|
32
|
-
PolarsTemporalType,
|
|
33
|
-
TimeUnit,
|
|
34
|
-
TransferEncoding,
|
|
35
|
-
UnicodeForm,
|
|
36
|
-
)
|
|
37
|
-
from polars._utils.various import NoDefault
|
|
38
|
-
|
|
39
|
-
if sys.version_info >= (3, 13):
|
|
40
|
-
from warnings import deprecated
|
|
41
|
-
else:
|
|
42
|
-
from typing_extensions import deprecated # noqa: TC004
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
class ExprStringNameSpace:
|
|
46
|
-
"""Namespace for string related expressions."""
|
|
47
|
-
|
|
48
|
-
_accessor = "str"
|
|
49
|
-
|
|
50
|
-
def __init__(self, expr: Expr) -> None:
|
|
51
|
-
self._pyexpr = expr._pyexpr
|
|
52
|
-
|
|
53
|
-
def to_date(
|
|
54
|
-
self,
|
|
55
|
-
format: str | None = None,
|
|
56
|
-
*,
|
|
57
|
-
strict: bool = True,
|
|
58
|
-
exact: bool = True,
|
|
59
|
-
cache: bool = True,
|
|
60
|
-
) -> Expr:
|
|
61
|
-
"""
|
|
62
|
-
Convert a String column into a Date column.
|
|
63
|
-
|
|
64
|
-
Parameters
|
|
65
|
-
----------
|
|
66
|
-
format
|
|
67
|
-
Format to use for conversion. Refer to the `chrono crate documentation
|
|
68
|
-
<https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
|
69
|
-
for the full specification. Example: `"%Y-%m-%d"`.
|
|
70
|
-
If set to None (default), the format is inferred from the data.
|
|
71
|
-
strict
|
|
72
|
-
Raise an error if any conversion fails.
|
|
73
|
-
exact
|
|
74
|
-
Require an exact format match. If False, allow the format to match anywhere
|
|
75
|
-
in the target string.
|
|
76
|
-
|
|
77
|
-
.. note::
|
|
78
|
-
Using `exact=False` introduces a performance penalty - cleaning your
|
|
79
|
-
data beforehand will almost certainly be more performant.
|
|
80
|
-
cache
|
|
81
|
-
Use a cache of unique, converted dates to apply the conversion.
|
|
82
|
-
|
|
83
|
-
Examples
|
|
84
|
-
--------
|
|
85
|
-
>>> s = pl.Series(["2020/01/01", "2020/02/01", "2020/03/01"])
|
|
86
|
-
>>> s.str.to_date()
|
|
87
|
-
shape: (3,)
|
|
88
|
-
Series: '' [date]
|
|
89
|
-
[
|
|
90
|
-
2020-01-01
|
|
91
|
-
2020-02-01
|
|
92
|
-
2020-03-01
|
|
93
|
-
]
|
|
94
|
-
"""
|
|
95
|
-
_validate_format_argument(format)
|
|
96
|
-
return wrap_expr(self._pyexpr.str_to_date(format, strict, exact, cache))
|
|
97
|
-
|
|
98
|
-
def to_datetime(
|
|
99
|
-
self,
|
|
100
|
-
format: str | None = None,
|
|
101
|
-
*,
|
|
102
|
-
time_unit: TimeUnit | None = None,
|
|
103
|
-
time_zone: str | None = None,
|
|
104
|
-
strict: bool = True,
|
|
105
|
-
exact: bool = True,
|
|
106
|
-
cache: bool = True,
|
|
107
|
-
ambiguous: Ambiguous | Expr = "raise",
|
|
108
|
-
) -> Expr:
|
|
109
|
-
"""
|
|
110
|
-
Convert a String column into a Datetime column.
|
|
111
|
-
|
|
112
|
-
Parameters
|
|
113
|
-
----------
|
|
114
|
-
format
|
|
115
|
-
Format to use for conversion. Refer to the `chrono crate documentation
|
|
116
|
-
<https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
|
117
|
-
for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
|
|
118
|
-
If set to None (default), the format is inferred from the data.
|
|
119
|
-
time_unit : {None, 'us', 'ns', 'ms'}
|
|
120
|
-
Unit of time for the resulting Datetime column. If set to None (default),
|
|
121
|
-
the time unit is inferred from the format string if given, eg:
|
|
122
|
-
`"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
|
|
123
|
-
found, the default is `"us"`.
|
|
124
|
-
time_zone
|
|
125
|
-
Time zone for the resulting Datetime column. Rules are:
|
|
126
|
-
|
|
127
|
-
- If inputs are tz-naive and `time_zone` is None, the result time zone is
|
|
128
|
-
`None`.
|
|
129
|
-
- If inputs are offset-aware and `time_zone` is None, inputs are converted
|
|
130
|
-
to `'UTC'` and the result time zone is `'UTC'`.
|
|
131
|
-
- If inputs are offset-aware and `time_zone` is given, inputs are converted
|
|
132
|
-
to `time_zone` and the result time zone is `time_zone`.
|
|
133
|
-
- If inputs are tz-naive and `time_zone` is given, input time zones are
|
|
134
|
-
replaced with (not converted to!) `time_zone`, and the result time zone
|
|
135
|
-
is `time_zone`.
|
|
136
|
-
strict
|
|
137
|
-
Raise an error if any conversion fails.
|
|
138
|
-
exact
|
|
139
|
-
Require an exact format match. If False, allow the format to match anywhere
|
|
140
|
-
in the target string.
|
|
141
|
-
|
|
142
|
-
.. note::
|
|
143
|
-
Using `exact=False` introduces a performance penalty - cleaning your
|
|
144
|
-
data beforehand will almost certainly be more performant.
|
|
145
|
-
cache
|
|
146
|
-
Use a cache of unique, converted datetimes to apply the conversion.
|
|
147
|
-
ambiguous
|
|
148
|
-
Determine how to deal with ambiguous datetimes:
|
|
149
|
-
|
|
150
|
-
- `'raise'` (default): raise
|
|
151
|
-
- `'earliest'`: use the earliest datetime
|
|
152
|
-
- `'latest'`: use the latest datetime
|
|
153
|
-
- `'null'`: set to null
|
|
154
|
-
|
|
155
|
-
Examples
|
|
156
|
-
--------
|
|
157
|
-
>>> s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
|
158
|
-
>>> s.str.to_datetime("%Y-%m-%d %H:%M%#z")
|
|
159
|
-
shape: (2,)
|
|
160
|
-
Series: '' [datetime[μs, UTC]]
|
|
161
|
-
[
|
|
162
|
-
2020-01-01 01:00:00 UTC
|
|
163
|
-
2020-01-01 02:00:00 UTC
|
|
164
|
-
]
|
|
165
|
-
"""
|
|
166
|
-
_validate_format_argument(format)
|
|
167
|
-
if not isinstance(ambiguous, pl.Expr):
|
|
168
|
-
ambiguous = F.lit(ambiguous)
|
|
169
|
-
return wrap_expr(
|
|
170
|
-
self._pyexpr.str_to_datetime(
|
|
171
|
-
format,
|
|
172
|
-
time_unit,
|
|
173
|
-
time_zone,
|
|
174
|
-
strict,
|
|
175
|
-
exact,
|
|
176
|
-
cache,
|
|
177
|
-
ambiguous._pyexpr,
|
|
178
|
-
)
|
|
179
|
-
)
|
|
180
|
-
|
|
181
|
-
def to_time(
|
|
182
|
-
self,
|
|
183
|
-
format: str | None = None,
|
|
184
|
-
*,
|
|
185
|
-
strict: bool = True,
|
|
186
|
-
cache: bool = True,
|
|
187
|
-
) -> Expr:
|
|
188
|
-
"""
|
|
189
|
-
Convert a String column into a Time column.
|
|
190
|
-
|
|
191
|
-
Parameters
|
|
192
|
-
----------
|
|
193
|
-
format
|
|
194
|
-
Format to use for conversion. Refer to the `chrono crate documentation
|
|
195
|
-
<https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
|
196
|
-
for the full specification. Example: `"%H:%M:%S"`.
|
|
197
|
-
If set to None (default), the format is inferred from the data.
|
|
198
|
-
strict
|
|
199
|
-
Raise an error if any conversion fails.
|
|
200
|
-
cache
|
|
201
|
-
Use a cache of unique, converted times to apply the conversion.
|
|
202
|
-
|
|
203
|
-
Examples
|
|
204
|
-
--------
|
|
205
|
-
>>> s = pl.Series(["01:00", "02:00", "03:00"])
|
|
206
|
-
>>> s.str.to_time("%H:%M")
|
|
207
|
-
shape: (3,)
|
|
208
|
-
Series: '' [time]
|
|
209
|
-
[
|
|
210
|
-
01:00:00
|
|
211
|
-
02:00:00
|
|
212
|
-
03:00:00
|
|
213
|
-
]
|
|
214
|
-
"""
|
|
215
|
-
_validate_format_argument(format)
|
|
216
|
-
return wrap_expr(self._pyexpr.str_to_time(format, strict, cache))
|
|
217
|
-
|
|
218
|
-
def strptime(
|
|
219
|
-
self,
|
|
220
|
-
dtype: PolarsTemporalType,
|
|
221
|
-
format: str | None = None,
|
|
222
|
-
*,
|
|
223
|
-
strict: bool = True,
|
|
224
|
-
exact: bool = True,
|
|
225
|
-
cache: bool = True,
|
|
226
|
-
ambiguous: Ambiguous | Expr = "raise",
|
|
227
|
-
) -> Expr:
|
|
228
|
-
"""
|
|
229
|
-
Convert a String column into a Date/Datetime/Time column.
|
|
230
|
-
|
|
231
|
-
Parameters
|
|
232
|
-
----------
|
|
233
|
-
dtype
|
|
234
|
-
The data type to convert into. Can be either Date, Datetime, or Time.
|
|
235
|
-
format
|
|
236
|
-
Format to use for conversion. Refer to the `chrono crate documentation
|
|
237
|
-
<https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
|
|
238
|
-
for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
|
|
239
|
-
If set to None (default), the format is inferred from the data.
|
|
240
|
-
strict
|
|
241
|
-
Raise an error if any conversion fails.
|
|
242
|
-
exact
|
|
243
|
-
Require an exact format match. If False, allow the format to match anywhere
|
|
244
|
-
in the target string. Conversion to the Time type is always exact.
|
|
245
|
-
|
|
246
|
-
.. note::
|
|
247
|
-
Using `exact=False` introduces a performance penalty - cleaning your
|
|
248
|
-
data beforehand will almost certainly be more performant.
|
|
249
|
-
cache
|
|
250
|
-
Use a cache of unique, converted dates to apply the datetime conversion.
|
|
251
|
-
ambiguous
|
|
252
|
-
Determine how to deal with ambiguous datetimes:
|
|
253
|
-
|
|
254
|
-
- `'raise'` (default): raise
|
|
255
|
-
- `'earliest'`: use the earliest datetime
|
|
256
|
-
- `'latest'`: use the latest datetime
|
|
257
|
-
- `'null'`: set to null
|
|
258
|
-
|
|
259
|
-
Notes
|
|
260
|
-
-----
|
|
261
|
-
When converting to a Datetime type, the time unit is inferred from the format
|
|
262
|
-
string if given, eg: `"%F %T%.3f"` => `Datetime("ms")`. If no fractional
|
|
263
|
-
second component is found, the default is `"us"`.
|
|
264
|
-
|
|
265
|
-
Examples
|
|
266
|
-
--------
|
|
267
|
-
Dealing with a consistent format:
|
|
268
|
-
|
|
269
|
-
>>> s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
|
270
|
-
>>> s.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z")
|
|
271
|
-
shape: (2,)
|
|
272
|
-
Series: '' [datetime[μs, UTC]]
|
|
273
|
-
[
|
|
274
|
-
2020-01-01 01:00:00 UTC
|
|
275
|
-
2020-01-01 02:00:00 UTC
|
|
276
|
-
]
|
|
277
|
-
|
|
278
|
-
Dealing with different formats.
|
|
279
|
-
|
|
280
|
-
>>> s = pl.Series(
|
|
281
|
-
... "date",
|
|
282
|
-
... [
|
|
283
|
-
... "2021-04-22",
|
|
284
|
-
... "2022-01-04 00:00:00",
|
|
285
|
-
... "01/31/22",
|
|
286
|
-
... "Sun Jul 8 00:34:60 2001",
|
|
287
|
-
... ],
|
|
288
|
-
... )
|
|
289
|
-
>>> s.to_frame().select(
|
|
290
|
-
... pl.coalesce(
|
|
291
|
-
... pl.col("date").str.strptime(pl.Date, "%F", strict=False),
|
|
292
|
-
... pl.col("date").str.strptime(pl.Date, "%F %T", strict=False),
|
|
293
|
-
... pl.col("date").str.strptime(pl.Date, "%D", strict=False),
|
|
294
|
-
... pl.col("date").str.strptime(pl.Date, "%c", strict=False),
|
|
295
|
-
... )
|
|
296
|
-
... ).to_series()
|
|
297
|
-
shape: (4,)
|
|
298
|
-
Series: 'date' [date]
|
|
299
|
-
[
|
|
300
|
-
2021-04-22
|
|
301
|
-
2022-01-04
|
|
302
|
-
2022-01-31
|
|
303
|
-
2001-07-08
|
|
304
|
-
]
|
|
305
|
-
"""
|
|
306
|
-
if dtype == Date:
|
|
307
|
-
return self.to_date(format, strict=strict, exact=exact, cache=cache)
|
|
308
|
-
elif dtype == Datetime:
|
|
309
|
-
time_unit = getattr(dtype, "time_unit", None)
|
|
310
|
-
time_zone = getattr(dtype, "time_zone", None)
|
|
311
|
-
return self.to_datetime(
|
|
312
|
-
format,
|
|
313
|
-
time_unit=time_unit,
|
|
314
|
-
time_zone=time_zone,
|
|
315
|
-
strict=strict,
|
|
316
|
-
exact=exact,
|
|
317
|
-
cache=cache,
|
|
318
|
-
ambiguous=ambiguous,
|
|
319
|
-
)
|
|
320
|
-
elif dtype == Time:
|
|
321
|
-
return self.to_time(format, strict=strict, cache=cache)
|
|
322
|
-
else:
|
|
323
|
-
msg = "`dtype` must be of type {Date, Datetime, Time}"
|
|
324
|
-
raise ValueError(msg)
|
|
325
|
-
|
|
326
|
-
@deprecate_nonkeyword_arguments(allowed_args=["self"], version="1.20.0")
|
|
327
|
-
@unstable()
|
|
328
|
-
def to_decimal(self, *, scale: int) -> Expr:
|
|
329
|
-
"""
|
|
330
|
-
Convert a String column into a Decimal column.
|
|
331
|
-
|
|
332
|
-
.. warning::
|
|
333
|
-
This functionality is considered **unstable**. It may be changed
|
|
334
|
-
at any point without it being considered a breaking change.
|
|
335
|
-
|
|
336
|
-
.. versionchanged:: 1.20.0
|
|
337
|
-
Parameter `inference_length` should now be passed as a keyword argument.
|
|
338
|
-
|
|
339
|
-
.. versionchanged:: 1.33.0
|
|
340
|
-
Parameter `inference_length` was removed and `scale` was made non-optional.
|
|
341
|
-
|
|
342
|
-
Parameters
|
|
343
|
-
----------
|
|
344
|
-
scale
|
|
345
|
-
Number of digits after the comma to use for the decimals.
|
|
346
|
-
|
|
347
|
-
Examples
|
|
348
|
-
--------
|
|
349
|
-
>>> df = pl.DataFrame(
|
|
350
|
-
... {
|
|
351
|
-
... "numbers": [
|
|
352
|
-
... "40.12",
|
|
353
|
-
... "3420.13",
|
|
354
|
-
... "120134.19",
|
|
355
|
-
... "3212.98",
|
|
356
|
-
... "12.90",
|
|
357
|
-
... "143.09",
|
|
358
|
-
... "143.9",
|
|
359
|
-
... ]
|
|
360
|
-
... }
|
|
361
|
-
... )
|
|
362
|
-
>>> df.with_columns(numbers_decimal=pl.col("numbers").str.to_decimal(scale=2))
|
|
363
|
-
shape: (7, 2)
|
|
364
|
-
┌───────────┬─────────────────┐
|
|
365
|
-
│ numbers ┆ numbers_decimal │
|
|
366
|
-
│ --- ┆ --- │
|
|
367
|
-
│ str ┆ decimal[38,2] │
|
|
368
|
-
╞═══════════╪═════════════════╡
|
|
369
|
-
│ 40.12 ┆ 40.12 │
|
|
370
|
-
│ 3420.13 ┆ 3420.13 │
|
|
371
|
-
│ 120134.19 ┆ 120134.19 │
|
|
372
|
-
│ 3212.98 ┆ 3212.98 │
|
|
373
|
-
│ 12.90 ┆ 12.90 │
|
|
374
|
-
│ 143.09 ┆ 143.09 │
|
|
375
|
-
│ 143.9 ┆ 143.90 │
|
|
376
|
-
└───────────┴─────────────────┘
|
|
377
|
-
"""
|
|
378
|
-
return wrap_expr(self._pyexpr.str_to_decimal(scale=scale))
|
|
379
|
-
|
|
380
|
-
def len_bytes(self) -> Expr:
|
|
381
|
-
"""
|
|
382
|
-
Return the length of each string as the number of bytes.
|
|
383
|
-
|
|
384
|
-
Returns
|
|
385
|
-
-------
|
|
386
|
-
Expr
|
|
387
|
-
Expression of data type :class:`UInt32`.
|
|
388
|
-
|
|
389
|
-
See Also
|
|
390
|
-
--------
|
|
391
|
-
len_chars
|
|
392
|
-
|
|
393
|
-
Notes
|
|
394
|
-
-----
|
|
395
|
-
When working with non-ASCII text, the length in bytes is not the same as the
|
|
396
|
-
length in characters. You may want to use :func:`len_chars` instead.
|
|
397
|
-
Note that :func:`len_bytes` is much more performant (_O(1)_) than
|
|
398
|
-
:func:`len_chars` (_O(n)_).
|
|
399
|
-
|
|
400
|
-
Examples
|
|
401
|
-
--------
|
|
402
|
-
>>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]})
|
|
403
|
-
>>> df.with_columns(
|
|
404
|
-
... pl.col("a").str.len_bytes().alias("n_bytes"),
|
|
405
|
-
... pl.col("a").str.len_chars().alias("n_chars"),
|
|
406
|
-
... )
|
|
407
|
-
shape: (4, 3)
|
|
408
|
-
┌──────┬─────────┬─────────┐
|
|
409
|
-
│ a ┆ n_bytes ┆ n_chars │
|
|
410
|
-
│ --- ┆ --- ┆ --- │
|
|
411
|
-
│ str ┆ u32 ┆ u32 │
|
|
412
|
-
╞══════╪═════════╪═════════╡
|
|
413
|
-
│ Café ┆ 5 ┆ 4 │
|
|
414
|
-
│ 345 ┆ 3 ┆ 3 │
|
|
415
|
-
│ 東京 ┆ 6 ┆ 2 │
|
|
416
|
-
│ null ┆ null ┆ null │
|
|
417
|
-
└──────┴─────────┴─────────┘
|
|
418
|
-
"""
|
|
419
|
-
return wrap_expr(self._pyexpr.str_len_bytes())
|
|
420
|
-
|
|
421
|
-
def len_chars(self) -> Expr:
|
|
422
|
-
"""
|
|
423
|
-
Return the length of each string as the number of characters.
|
|
424
|
-
|
|
425
|
-
Returns
|
|
426
|
-
-------
|
|
427
|
-
Expr
|
|
428
|
-
Expression of data type :class:`UInt32`.
|
|
429
|
-
|
|
430
|
-
See Also
|
|
431
|
-
--------
|
|
432
|
-
len_bytes
|
|
433
|
-
|
|
434
|
-
Notes
|
|
435
|
-
-----
|
|
436
|
-
When working with ASCII text, use :func:`len_bytes` instead to achieve
|
|
437
|
-
equivalent output with much better performance:
|
|
438
|
-
:func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_).
|
|
439
|
-
|
|
440
|
-
A character is defined as a `Unicode scalar value`_. A single character is
|
|
441
|
-
represented by a single byte when working with ASCII text, and a maximum of
|
|
442
|
-
4 bytes otherwise.
|
|
443
|
-
|
|
444
|
-
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
|
|
445
|
-
|
|
446
|
-
Examples
|
|
447
|
-
--------
|
|
448
|
-
>>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]})
|
|
449
|
-
>>> df.with_columns(
|
|
450
|
-
... pl.col("a").str.len_chars().alias("n_chars"),
|
|
451
|
-
... pl.col("a").str.len_bytes().alias("n_bytes"),
|
|
452
|
-
... )
|
|
453
|
-
shape: (4, 3)
|
|
454
|
-
┌──────┬─────────┬─────────┐
|
|
455
|
-
│ a ┆ n_chars ┆ n_bytes │
|
|
456
|
-
│ --- ┆ --- ┆ --- │
|
|
457
|
-
│ str ┆ u32 ┆ u32 │
|
|
458
|
-
╞══════╪═════════╪═════════╡
|
|
459
|
-
│ Café ┆ 4 ┆ 5 │
|
|
460
|
-
│ 345 ┆ 3 ┆ 3 │
|
|
461
|
-
│ 東京 ┆ 2 ┆ 6 │
|
|
462
|
-
│ null ┆ null ┆ null │
|
|
463
|
-
└──────┴─────────┴─────────┘
|
|
464
|
-
"""
|
|
465
|
-
return wrap_expr(self._pyexpr.str_len_chars())
|
|
466
|
-
|
|
467
|
-
def to_uppercase(self) -> Expr:
|
|
468
|
-
"""
|
|
469
|
-
Modify strings to their uppercase equivalent.
|
|
470
|
-
|
|
471
|
-
Examples
|
|
472
|
-
--------
|
|
473
|
-
>>> df = pl.DataFrame({"foo": ["cat", "dog"]})
|
|
474
|
-
>>> df.with_columns(foo_upper=pl.col("foo").str.to_uppercase())
|
|
475
|
-
shape: (2, 2)
|
|
476
|
-
┌─────┬───────────┐
|
|
477
|
-
│ foo ┆ foo_upper │
|
|
478
|
-
│ --- ┆ --- │
|
|
479
|
-
│ str ┆ str │
|
|
480
|
-
╞═════╪═══════════╡
|
|
481
|
-
│ cat ┆ CAT │
|
|
482
|
-
│ dog ┆ DOG │
|
|
483
|
-
└─────┴───────────┘
|
|
484
|
-
"""
|
|
485
|
-
return wrap_expr(self._pyexpr.str_to_uppercase())
|
|
486
|
-
|
|
487
|
-
def to_lowercase(self) -> Expr:
|
|
488
|
-
"""
|
|
489
|
-
Modify strings to their lowercase equivalent.
|
|
490
|
-
|
|
491
|
-
Examples
|
|
492
|
-
--------
|
|
493
|
-
>>> df = pl.DataFrame({"foo": ["CAT", "DOG"]})
|
|
494
|
-
>>> df.with_columns(foo_lower=pl.col("foo").str.to_lowercase())
|
|
495
|
-
shape: (2, 2)
|
|
496
|
-
┌─────┬───────────┐
|
|
497
|
-
│ foo ┆ foo_lower │
|
|
498
|
-
│ --- ┆ --- │
|
|
499
|
-
│ str ┆ str │
|
|
500
|
-
╞═════╪═══════════╡
|
|
501
|
-
│ CAT ┆ cat │
|
|
502
|
-
│ DOG ┆ dog │
|
|
503
|
-
└─────┴───────────┘
|
|
504
|
-
"""
|
|
505
|
-
return wrap_expr(self._pyexpr.str_to_lowercase())
|
|
506
|
-
|
|
507
|
-
def to_titlecase(self) -> Expr:
|
|
508
|
-
"""
|
|
509
|
-
Modify strings to their titlecase equivalent.
|
|
510
|
-
|
|
511
|
-
Notes
|
|
512
|
-
-----
|
|
513
|
-
This is a form of case transform where the first letter of each word is
|
|
514
|
-
capitalized, with the rest of the word in lowercase. Non-alphanumeric
|
|
515
|
-
characters define the word boundaries.
|
|
516
|
-
|
|
517
|
-
Examples
|
|
518
|
-
--------
|
|
519
|
-
>>> df = pl.DataFrame(
|
|
520
|
-
... {
|
|
521
|
-
... "quotes": [
|
|
522
|
-
... "'e.t. phone home'",
|
|
523
|
-
... "you talkin' to me?",
|
|
524
|
-
... "to infinity,and BEYOND!",
|
|
525
|
-
... ]
|
|
526
|
-
... }
|
|
527
|
-
... )
|
|
528
|
-
>>> df.with_columns(
|
|
529
|
-
... quotes_title=pl.col("quotes").str.to_titlecase(),
|
|
530
|
-
... )
|
|
531
|
-
shape: (3, 2)
|
|
532
|
-
┌─────────────────────────┬─────────────────────────┐
|
|
533
|
-
│ quotes ┆ quotes_title │
|
|
534
|
-
│ --- ┆ --- │
|
|
535
|
-
│ str ┆ str │
|
|
536
|
-
╞═════════════════════════╪═════════════════════════╡
|
|
537
|
-
│ 'e.t. phone home' ┆ 'E.T. Phone Home' │
|
|
538
|
-
│ you talkin' to me? ┆ You Talkin' To Me? │
|
|
539
|
-
│ to infinity,and BEYOND! ┆ To Infinity,And Beyond! │
|
|
540
|
-
└─────────────────────────┴─────────────────────────┘
|
|
541
|
-
"""
|
|
542
|
-
return wrap_expr(self._pyexpr.str_to_titlecase())
|
|
543
|
-
|
|
544
|
-
def strip_chars(self, characters: IntoExpr = None) -> Expr:
|
|
545
|
-
r"""
|
|
546
|
-
Remove leading and trailing characters.
|
|
547
|
-
|
|
548
|
-
Parameters
|
|
549
|
-
----------
|
|
550
|
-
characters
|
|
551
|
-
The set of characters to be removed. All combinations of this set of
|
|
552
|
-
characters will be stripped from the start and end of the string. If set to
|
|
553
|
-
None (default), all leading and trailing whitespace is removed instead.
|
|
554
|
-
|
|
555
|
-
Examples
|
|
556
|
-
--------
|
|
557
|
-
>>> df = pl.DataFrame({"foo": [" hello", "\nworld"]})
|
|
558
|
-
>>> df
|
|
559
|
-
shape: (2, 1)
|
|
560
|
-
┌────────┐
|
|
561
|
-
│ foo │
|
|
562
|
-
│ --- │
|
|
563
|
-
│ str │
|
|
564
|
-
╞════════╡
|
|
565
|
-
│ hello │
|
|
566
|
-
│ │
|
|
567
|
-
│ world │
|
|
568
|
-
└────────┘
|
|
569
|
-
|
|
570
|
-
>>> df.with_columns(foo_stripped=pl.col("foo").str.strip_chars())
|
|
571
|
-
shape: (2, 2)
|
|
572
|
-
┌────────┬──────────────┐
|
|
573
|
-
│ foo ┆ foo_stripped │
|
|
574
|
-
│ --- ┆ --- │
|
|
575
|
-
│ str ┆ str │
|
|
576
|
-
╞════════╪══════════════╡
|
|
577
|
-
│ hello ┆ hello │
|
|
578
|
-
│ ┆ world │
|
|
579
|
-
│ world ┆ │
|
|
580
|
-
└────────┴──────────────┘
|
|
581
|
-
|
|
582
|
-
Characters can be stripped by passing a string as argument. Note that whitespace
|
|
583
|
-
will not be stripped automatically when doing so, unless that whitespace is
|
|
584
|
-
also included in the string.
|
|
585
|
-
|
|
586
|
-
>>> df.with_columns(foo_stripped=pl.col("foo").str.strip_chars("ow\n"))
|
|
587
|
-
shape: (2, 2)
|
|
588
|
-
┌────────┬──────────────┐
|
|
589
|
-
│ foo ┆ foo_stripped │
|
|
590
|
-
│ --- ┆ --- │
|
|
591
|
-
│ str ┆ str │
|
|
592
|
-
╞════════╪══════════════╡
|
|
593
|
-
│ hello ┆ hell │
|
|
594
|
-
│ ┆ rld │
|
|
595
|
-
│ world ┆ │
|
|
596
|
-
└────────┴──────────────┘
|
|
597
|
-
"""
|
|
598
|
-
characters_pyexpr = parse_into_expression(characters, str_as_lit=True)
|
|
599
|
-
return wrap_expr(self._pyexpr.str_strip_chars(characters_pyexpr))
|
|
600
|
-
|
|
601
|
-
def strip_chars_start(self, characters: IntoExpr = None) -> Expr:
|
|
602
|
-
r"""
|
|
603
|
-
Remove leading characters.
|
|
604
|
-
|
|
605
|
-
.. note::
|
|
606
|
-
This method strips any characters present in `characters` from the
|
|
607
|
-
start of the input, no matter their order. To strip a prefix (i.e.
|
|
608
|
-
a "word" of characters in a certain order), use
|
|
609
|
-
:func:`strip_prefix` instead.
|
|
610
|
-
|
|
611
|
-
Parameters
|
|
612
|
-
----------
|
|
613
|
-
characters
|
|
614
|
-
The set of characters to be removed. All combinations of this set of
|
|
615
|
-
characters will be stripped from the start of the string. If set to None
|
|
616
|
-
(default), all leading whitespace is removed instead.
|
|
617
|
-
|
|
618
|
-
See Also
|
|
619
|
-
--------
|
|
620
|
-
strip_prefix
|
|
621
|
-
strip_chars_end
|
|
622
|
-
|
|
623
|
-
Examples
|
|
624
|
-
--------
|
|
625
|
-
>>> df = pl.DataFrame({"foo": [" hello ", "\tworld"]})
|
|
626
|
-
>>> df.with_columns(foo_strip_start=pl.col("foo").str.strip_chars_start())
|
|
627
|
-
shape: (2, 2)
|
|
628
|
-
┌─────────┬─────────────────┐
|
|
629
|
-
│ foo ┆ foo_strip_start │
|
|
630
|
-
│ --- ┆ --- │
|
|
631
|
-
│ str ┆ str │
|
|
632
|
-
╞═════════╪═════════════════╡
|
|
633
|
-
│ hello ┆ hello │
|
|
634
|
-
│ world ┆ world │
|
|
635
|
-
└─────────┴─────────────────┘
|
|
636
|
-
|
|
637
|
-
Characters can be stripped by passing a string as argument. Note that whitespace
|
|
638
|
-
will not be stripped automatically when doing so.
|
|
639
|
-
|
|
640
|
-
>>> df.with_columns(
|
|
641
|
-
... foo_strip_start=pl.col("foo").str.strip_chars_start("wod\t"),
|
|
642
|
-
... )
|
|
643
|
-
shape: (2, 2)
|
|
644
|
-
┌─────────┬─────────────────┐
|
|
645
|
-
│ foo ┆ foo_strip_start │
|
|
646
|
-
│ --- ┆ --- │
|
|
647
|
-
│ str ┆ str │
|
|
648
|
-
╞═════════╪═════════════════╡
|
|
649
|
-
│ hello ┆ hello │
|
|
650
|
-
│ world ┆ rld │
|
|
651
|
-
└─────────┴─────────────────┘
|
|
652
|
-
|
|
653
|
-
The order of the provided characters does not matter, they behave like a set.
|
|
654
|
-
|
|
655
|
-
>>> pl.DataFrame({"foo": ["aabcdef"]}).with_columns(
|
|
656
|
-
... foo_strip_start=pl.col("foo").str.strip_chars_start("cba")
|
|
657
|
-
... )
|
|
658
|
-
shape: (1, 2)
|
|
659
|
-
┌─────────┬─────────────────┐
|
|
660
|
-
│ foo ┆ foo_strip_start │
|
|
661
|
-
│ --- ┆ --- │
|
|
662
|
-
│ str ┆ str │
|
|
663
|
-
╞═════════╪═════════════════╡
|
|
664
|
-
│ aabcdef ┆ def │
|
|
665
|
-
└─────────┴─────────────────┘
|
|
666
|
-
"""
|
|
667
|
-
characters_pyexpr = parse_into_expression(characters, str_as_lit=True)
|
|
668
|
-
return wrap_expr(self._pyexpr.str_strip_chars_start(characters_pyexpr))
|
|
669
|
-
|
|
670
|
-
def strip_chars_end(self, characters: IntoExpr = None) -> Expr:
|
|
671
|
-
r"""
|
|
672
|
-
Remove trailing characters.
|
|
673
|
-
|
|
674
|
-
.. note::
|
|
675
|
-
This method strips any characters present in `characters` from the
|
|
676
|
-
end of the input, no matter their order. To strip a suffix (i.e.
|
|
677
|
-
a "word" of characters in a certain order), use
|
|
678
|
-
:func:`strip_suffix` instead.
|
|
679
|
-
|
|
680
|
-
Parameters
|
|
681
|
-
----------
|
|
682
|
-
characters
|
|
683
|
-
The set of characters to be removed. All combinations of this set of
|
|
684
|
-
characters will be stripped from the end of the string. If set to None
|
|
685
|
-
(default), all trailing whitespace is removed instead.
|
|
686
|
-
|
|
687
|
-
See Also
|
|
688
|
-
--------
|
|
689
|
-
strip_suffix
|
|
690
|
-
strip_chars_start
|
|
691
|
-
|
|
692
|
-
Examples
|
|
693
|
-
--------
|
|
694
|
-
>>> df = pl.DataFrame({"foo": [" hello", "world\n"]})
|
|
695
|
-
>>> df
|
|
696
|
-
shape: (2, 1)
|
|
697
|
-
┌────────┐
|
|
698
|
-
│ foo │
|
|
699
|
-
│ --- │
|
|
700
|
-
│ str │
|
|
701
|
-
╞════════╡
|
|
702
|
-
│ hello │
|
|
703
|
-
│ world │
|
|
704
|
-
│ │
|
|
705
|
-
└────────┘
|
|
706
|
-
>>> df.with_columns(foo_strip_end=pl.col("foo").str.strip_chars_end())
|
|
707
|
-
shape: (2, 2)
|
|
708
|
-
┌────────┬───────────────┐
|
|
709
|
-
│ foo ┆ foo_strip_end │
|
|
710
|
-
│ --- ┆ --- │
|
|
711
|
-
│ str ┆ str │
|
|
712
|
-
╞════════╪═══════════════╡
|
|
713
|
-
│ hello ┆ hello │
|
|
714
|
-
│ world ┆ world │
|
|
715
|
-
│ ┆ │
|
|
716
|
-
└────────┴───────────────┘
|
|
717
|
-
|
|
718
|
-
Characters can be stripped by passing a string as argument. Note that whitespace
|
|
719
|
-
will not be stripped automatically when doing so, unless that whitespace is
|
|
720
|
-
also included in the string.
|
|
721
|
-
|
|
722
|
-
>>> df.with_columns(foo_strip_end=pl.col("foo").str.strip_chars_end("oldw "))
|
|
723
|
-
shape: (2, 2)
|
|
724
|
-
┌────────┬───────────────┐
|
|
725
|
-
│ foo ┆ foo_strip_end │
|
|
726
|
-
│ --- ┆ --- │
|
|
727
|
-
│ str ┆ str │
|
|
728
|
-
╞════════╪═══════════════╡
|
|
729
|
-
│ hello ┆ he │
|
|
730
|
-
│ world ┆ world │
|
|
731
|
-
│ ┆ │
|
|
732
|
-
└────────┴───────────────┘
|
|
733
|
-
|
|
734
|
-
The order of the provided characters does not matter, they behave like a set.
|
|
735
|
-
|
|
736
|
-
>>> pl.DataFrame({"foo": ["abcdeff"]}).with_columns(
|
|
737
|
-
... foo_strip_end=pl.col("foo").str.strip_chars_end("fed")
|
|
738
|
-
... )
|
|
739
|
-
shape: (1, 2)
|
|
740
|
-
┌─────────┬───────────────┐
|
|
741
|
-
│ foo ┆ foo_strip_end │
|
|
742
|
-
│ --- ┆ --- │
|
|
743
|
-
│ str ┆ str │
|
|
744
|
-
╞═════════╪═══════════════╡
|
|
745
|
-
│ abcdeff ┆ abc │
|
|
746
|
-
└─────────┴───────────────┘
|
|
747
|
-
"""
|
|
748
|
-
characters_pyexpr = parse_into_expression(characters, str_as_lit=True)
|
|
749
|
-
return wrap_expr(self._pyexpr.str_strip_chars_end(characters_pyexpr))
|
|
750
|
-
|
|
751
|
-
def strip_prefix(self, prefix: IntoExpr) -> Expr:
|
|
752
|
-
"""
|
|
753
|
-
Remove prefix.
|
|
754
|
-
|
|
755
|
-
The prefix will be removed from the string exactly once, if found.
|
|
756
|
-
|
|
757
|
-
.. note::
|
|
758
|
-
This method strips the exact character sequence provided in
|
|
759
|
-
`prefix` from the start of the input. To strip a set of characters
|
|
760
|
-
in any order, use :func:`strip_chars_start` instead.
|
|
761
|
-
|
|
762
|
-
Parameters
|
|
763
|
-
----------
|
|
764
|
-
prefix
|
|
765
|
-
The prefix to be removed.
|
|
766
|
-
|
|
767
|
-
See Also
|
|
768
|
-
--------
|
|
769
|
-
strip_chars_start
|
|
770
|
-
strip_suffix
|
|
771
|
-
|
|
772
|
-
Examples
|
|
773
|
-
--------
|
|
774
|
-
>>> df = pl.DataFrame({"a": ["foobar", "foofoobar", "foo", "bar"]})
|
|
775
|
-
>>> df.with_columns(pl.col("a").str.strip_prefix("foo").alias("stripped"))
|
|
776
|
-
shape: (4, 2)
|
|
777
|
-
┌───────────┬──────────┐
|
|
778
|
-
│ a ┆ stripped │
|
|
779
|
-
│ --- ┆ --- │
|
|
780
|
-
│ str ┆ str │
|
|
781
|
-
╞═══════════╪══════════╡
|
|
782
|
-
│ foobar ┆ bar │
|
|
783
|
-
│ foofoobar ┆ foobar │
|
|
784
|
-
│ foo ┆ │
|
|
785
|
-
│ bar ┆ bar │
|
|
786
|
-
└───────────┴──────────┘
|
|
787
|
-
"""
|
|
788
|
-
prefix_pyexpr = parse_into_expression(prefix, str_as_lit=True)
|
|
789
|
-
return wrap_expr(self._pyexpr.str_strip_prefix(prefix_pyexpr))
|
|
790
|
-
|
|
791
|
-
def strip_suffix(self, suffix: IntoExpr) -> Expr:
|
|
792
|
-
"""
|
|
793
|
-
Remove suffix.
|
|
794
|
-
|
|
795
|
-
The suffix will be removed from the string exactly once, if found.
|
|
796
|
-
|
|
797
|
-
.. note::
|
|
798
|
-
This method strips the exact character sequence provided in
|
|
799
|
-
`suffix` from the end of the input. To strip a set of characters
|
|
800
|
-
in any order, use :func:`strip_chars_end` instead.
|
|
801
|
-
|
|
802
|
-
Parameters
|
|
803
|
-
----------
|
|
804
|
-
suffix
|
|
805
|
-
The suffix to be removed.
|
|
806
|
-
|
|
807
|
-
See Also
|
|
808
|
-
--------
|
|
809
|
-
strip_chars_end
|
|
810
|
-
strip_prefix
|
|
811
|
-
|
|
812
|
-
Examples
|
|
813
|
-
--------
|
|
814
|
-
>>> df = pl.DataFrame({"a": ["foobar", "foobarbar", "foo", "bar"]})
|
|
815
|
-
>>> df.with_columns(pl.col("a").str.strip_suffix("bar").alias("stripped"))
|
|
816
|
-
shape: (4, 2)
|
|
817
|
-
┌───────────┬──────────┐
|
|
818
|
-
│ a ┆ stripped │
|
|
819
|
-
│ --- ┆ --- │
|
|
820
|
-
│ str ┆ str │
|
|
821
|
-
╞═══════════╪══════════╡
|
|
822
|
-
│ foobar ┆ foo │
|
|
823
|
-
│ foobarbar ┆ foobar │
|
|
824
|
-
│ foo ┆ foo │
|
|
825
|
-
│ bar ┆ │
|
|
826
|
-
└───────────┴──────────┘
|
|
827
|
-
"""
|
|
828
|
-
suffix_pyexpr = parse_into_expression(suffix, str_as_lit=True)
|
|
829
|
-
return wrap_expr(self._pyexpr.str_strip_suffix(suffix_pyexpr))
|
|
830
|
-
|
|
831
|
-
def pad_start(self, length: int | IntoExprColumn, fill_char: str = " ") -> Expr:
|
|
832
|
-
"""
|
|
833
|
-
Pad the start of the string until it reaches the given length.
|
|
834
|
-
|
|
835
|
-
Parameters
|
|
836
|
-
----------
|
|
837
|
-
length
|
|
838
|
-
Pad the string until it reaches this length. Strings with length equal to or
|
|
839
|
-
greater than this value are returned as-is.
|
|
840
|
-
fill_char
|
|
841
|
-
The character to pad the string with.
|
|
842
|
-
|
|
843
|
-
See Also
|
|
844
|
-
--------
|
|
845
|
-
pad_end
|
|
846
|
-
zfill
|
|
847
|
-
|
|
848
|
-
Examples
|
|
849
|
-
--------
|
|
850
|
-
>>> df = pl.DataFrame({"a": ["cow", "monkey", "hippopotamus", None]})
|
|
851
|
-
>>> df.with_columns(padded=pl.col("a").str.pad_start(8, "*"))
|
|
852
|
-
shape: (4, 2)
|
|
853
|
-
┌──────────────┬──────────────┐
|
|
854
|
-
│ a ┆ padded │
|
|
855
|
-
│ --- ┆ --- │
|
|
856
|
-
│ str ┆ str │
|
|
857
|
-
╞══════════════╪══════════════╡
|
|
858
|
-
│ cow ┆ *****cow │
|
|
859
|
-
│ monkey ┆ **monkey │
|
|
860
|
-
│ hippopotamus ┆ hippopotamus │
|
|
861
|
-
│ null ┆ null │
|
|
862
|
-
└──────────────┴──────────────┘
|
|
863
|
-
"""
|
|
864
|
-
length_pyexpr = parse_into_expression(length)
|
|
865
|
-
if not isinstance(fill_char, str):
|
|
866
|
-
msg = f'"pad_start" expects a `str`, given a {qualified_type_name(fill_char)!r}'
|
|
867
|
-
raise TypeError(msg)
|
|
868
|
-
return wrap_expr(self._pyexpr.str_pad_start(length_pyexpr, fill_char))
|
|
869
|
-
|
|
870
|
-
def pad_end(self, length: int | IntoExprColumn, fill_char: str = " ") -> Expr:
|
|
871
|
-
"""
|
|
872
|
-
Pad the end of the string until it reaches the given length.
|
|
873
|
-
|
|
874
|
-
Parameters
|
|
875
|
-
----------
|
|
876
|
-
length
|
|
877
|
-
Pad the string until it reaches this length. Strings with length equal to or
|
|
878
|
-
greater than this value are returned as-is. Can be int or expression.
|
|
879
|
-
fill_char
|
|
880
|
-
The character to pad the string with.
|
|
881
|
-
|
|
882
|
-
See Also
|
|
883
|
-
--------
|
|
884
|
-
pad_start
|
|
885
|
-
|
|
886
|
-
Examples
|
|
887
|
-
--------
|
|
888
|
-
>>> df = pl.DataFrame({"a": ["cow", "monkey", "hippopotamus", None]})
|
|
889
|
-
>>> df.with_columns(padded=pl.col("a").str.pad_end(8, "*"))
|
|
890
|
-
shape: (4, 2)
|
|
891
|
-
┌──────────────┬──────────────┐
|
|
892
|
-
│ a ┆ padded │
|
|
893
|
-
│ --- ┆ --- │
|
|
894
|
-
│ str ┆ str │
|
|
895
|
-
╞══════════════╪══════════════╡
|
|
896
|
-
│ cow ┆ cow***** │
|
|
897
|
-
│ monkey ┆ monkey** │
|
|
898
|
-
│ hippopotamus ┆ hippopotamus │
|
|
899
|
-
│ null ┆ null │
|
|
900
|
-
└──────────────┴──────────────┘
|
|
901
|
-
"""
|
|
902
|
-
length_pyexpr = parse_into_expression(length)
|
|
903
|
-
if not isinstance(fill_char, str):
|
|
904
|
-
msg = (
|
|
905
|
-
f'"pad_end" expects a `str`, given a {qualified_type_name(fill_char)!r}'
|
|
906
|
-
)
|
|
907
|
-
raise TypeError(msg)
|
|
908
|
-
return wrap_expr(self._pyexpr.str_pad_end(length_pyexpr, fill_char))
|
|
909
|
-
|
|
910
|
-
def zfill(self, length: int | IntoExprColumn) -> Expr:
|
|
911
|
-
"""
|
|
912
|
-
Pad the start of the string with zeros until it reaches the given length.
|
|
913
|
-
|
|
914
|
-
A sign prefix (`-`) is handled by inserting the padding after the sign
|
|
915
|
-
character rather than before.
|
|
916
|
-
|
|
917
|
-
Parameters
|
|
918
|
-
----------
|
|
919
|
-
length
|
|
920
|
-
Pad the string until it reaches this length. Strings with length equal to
|
|
921
|
-
or greater than this value are returned as-is.
|
|
922
|
-
|
|
923
|
-
See Also
|
|
924
|
-
--------
|
|
925
|
-
pad_start
|
|
926
|
-
|
|
927
|
-
Notes
|
|
928
|
-
-----
|
|
929
|
-
This method is intended for padding numeric strings. If your data contains
|
|
930
|
-
non-ASCII characters, use :func:`pad_start` instead.
|
|
931
|
-
|
|
932
|
-
Examples
|
|
933
|
-
--------
|
|
934
|
-
>>> df = pl.DataFrame({"a": [-1, 123, 999999, None]})
|
|
935
|
-
>>> df.with_columns(zfill=pl.col("a").cast(pl.String).str.zfill(4))
|
|
936
|
-
shape: (4, 2)
|
|
937
|
-
┌────────┬────────┐
|
|
938
|
-
│ a ┆ zfill │
|
|
939
|
-
│ --- ┆ --- │
|
|
940
|
-
│ i64 ┆ str │
|
|
941
|
-
╞════════╪════════╡
|
|
942
|
-
│ -1 ┆ -001 │
|
|
943
|
-
│ 123 ┆ 0123 │
|
|
944
|
-
│ 999999 ┆ 999999 │
|
|
945
|
-
│ null ┆ null │
|
|
946
|
-
└────────┴────────┘
|
|
947
|
-
>>> df = pl.DataFrame(
|
|
948
|
-
... {
|
|
949
|
-
... "a": [-1, 123, 999999, None],
|
|
950
|
-
... "length": [8, 4, 1, 2],
|
|
951
|
-
... }
|
|
952
|
-
... )
|
|
953
|
-
>>> df.with_columns(zfill=pl.col("a").cast(pl.String).str.zfill("length"))
|
|
954
|
-
shape: (4, 3)
|
|
955
|
-
┌────────┬────────┬──────────┐
|
|
956
|
-
│ a ┆ length ┆ zfill │
|
|
957
|
-
│ --- ┆ --- ┆ --- │
|
|
958
|
-
│ i64 ┆ i64 ┆ str │
|
|
959
|
-
╞════════╪════════╪══════════╡
|
|
960
|
-
│ -1 ┆ 8 ┆ -0000001 │
|
|
961
|
-
│ 123 ┆ 4 ┆ 0123 │
|
|
962
|
-
│ 999999 ┆ 1 ┆ 999999 │
|
|
963
|
-
│ null ┆ 2 ┆ null │
|
|
964
|
-
└────────┴────────┴──────────┘
|
|
965
|
-
"""
|
|
966
|
-
length_pyexpr = parse_into_expression(length)
|
|
967
|
-
return wrap_expr(self._pyexpr.str_zfill(length_pyexpr))
|
|
968
|
-
|
|
969
|
-
def contains(
|
|
970
|
-
self, pattern: str | Expr, *, literal: bool = False, strict: bool = True
|
|
971
|
-
) -> Expr:
|
|
972
|
-
"""
|
|
973
|
-
Check if the string contains a substring that matches a pattern.
|
|
974
|
-
|
|
975
|
-
Parameters
|
|
976
|
-
----------
|
|
977
|
-
pattern
|
|
978
|
-
A valid regular expression pattern, compatible with the `regex crate
|
|
979
|
-
<https://docs.rs/regex/latest/regex/>`_.
|
|
980
|
-
literal
|
|
981
|
-
Treat `pattern` as a literal string, not as a regular expression.
|
|
982
|
-
strict
|
|
983
|
-
Raise an error if the underlying pattern is not a valid regex,
|
|
984
|
-
otherwise mask out with a null value.
|
|
985
|
-
|
|
986
|
-
Notes
|
|
987
|
-
-----
|
|
988
|
-
To modify regular expression behaviour (such as case-sensitivity) with
|
|
989
|
-
flags, use the inline `(?iLmsuxU)` syntax. For example:
|
|
990
|
-
|
|
991
|
-
>>> pl.DataFrame({"s": ["AAA", "aAa", "aaa"]}).with_columns(
|
|
992
|
-
... default_match=pl.col("s").str.contains("AA"),
|
|
993
|
-
... insensitive_match=pl.col("s").str.contains("(?i)AA"),
|
|
994
|
-
... )
|
|
995
|
-
shape: (3, 3)
|
|
996
|
-
┌─────┬───────────────┬───────────────────┐
|
|
997
|
-
│ s ┆ default_match ┆ insensitive_match │
|
|
998
|
-
│ --- ┆ --- ┆ --- │
|
|
999
|
-
│ str ┆ bool ┆ bool │
|
|
1000
|
-
╞═════╪═══════════════╪═══════════════════╡
|
|
1001
|
-
│ AAA ┆ true ┆ true │
|
|
1002
|
-
│ aAa ┆ false ┆ true │
|
|
1003
|
-
│ aaa ┆ false ┆ true │
|
|
1004
|
-
└─────┴───────────────┴───────────────────┘
|
|
1005
|
-
|
|
1006
|
-
See the regex crate's section on `grouping and flags
|
|
1007
|
-
<https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
|
|
1008
|
-
additional information about the use of inline expression modifiers.
|
|
1009
|
-
|
|
1010
|
-
See Also
|
|
1011
|
-
--------
|
|
1012
|
-
starts_with : Check if string values start with a substring.
|
|
1013
|
-
ends_with : Check if string values end with a substring.
|
|
1014
|
-
find: Return the index of the first substring matching a pattern.
|
|
1015
|
-
|
|
1016
|
-
Examples
|
|
1017
|
-
--------
|
|
1018
|
-
>>> df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", None]})
|
|
1019
|
-
>>> df.select(
|
|
1020
|
-
... pl.col("txt"),
|
|
1021
|
-
... pl.col("txt").str.contains("cat|bit").alias("regex"),
|
|
1022
|
-
... pl.col("txt").str.contains("rab$", literal=True).alias("literal"),
|
|
1023
|
-
... )
|
|
1024
|
-
shape: (4, 3)
|
|
1025
|
-
┌─────────────┬───────┬─────────┐
|
|
1026
|
-
│ txt ┆ regex ┆ literal │
|
|
1027
|
-
│ --- ┆ --- ┆ --- │
|
|
1028
|
-
│ str ┆ bool ┆ bool │
|
|
1029
|
-
╞═════════════╪═══════╪═════════╡
|
|
1030
|
-
│ Crab ┆ false ┆ false │
|
|
1031
|
-
│ cat and dog ┆ true ┆ false │
|
|
1032
|
-
│ rab$bit ┆ true ┆ true │
|
|
1033
|
-
│ null ┆ null ┆ null │
|
|
1034
|
-
└─────────────┴───────┴─────────┘
|
|
1035
|
-
"""
|
|
1036
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
1037
|
-
return wrap_expr(self._pyexpr.str_contains(pattern_pyexpr, literal, strict))
|
|
1038
|
-
|
|
1039
|
-
def find(
|
|
1040
|
-
self, pattern: str | Expr, *, literal: bool = False, strict: bool = True
|
|
1041
|
-
) -> Expr:
|
|
1042
|
-
"""
|
|
1043
|
-
Return the bytes offset of the first substring matching a pattern.
|
|
1044
|
-
|
|
1045
|
-
If the pattern is not found, returns None.
|
|
1046
|
-
|
|
1047
|
-
Parameters
|
|
1048
|
-
----------
|
|
1049
|
-
pattern
|
|
1050
|
-
A valid regular expression pattern, compatible with the `regex crate
|
|
1051
|
-
<https://docs.rs/regex/latest/regex/>`_.
|
|
1052
|
-
literal
|
|
1053
|
-
Treat `pattern` as a literal string, not as a regular expression.
|
|
1054
|
-
strict
|
|
1055
|
-
Raise an error if the underlying pattern is not a valid regex,
|
|
1056
|
-
otherwise mask out with a null value.
|
|
1057
|
-
|
|
1058
|
-
Notes
|
|
1059
|
-
-----
|
|
1060
|
-
To modify regular expression behaviour (such as case-sensitivity) with
|
|
1061
|
-
flags, use the inline `(?iLmsuxU)` syntax. For example:
|
|
1062
|
-
|
|
1063
|
-
>>> pl.DataFrame({"s": ["AAA", "aAa", "aaa"]}).with_columns(
|
|
1064
|
-
... default_match=pl.col("s").str.find("Aa"),
|
|
1065
|
-
... insensitive_match=pl.col("s").str.find("(?i)Aa"),
|
|
1066
|
-
... )
|
|
1067
|
-
shape: (3, 3)
|
|
1068
|
-
┌─────┬───────────────┬───────────────────┐
|
|
1069
|
-
│ s ┆ default_match ┆ insensitive_match │
|
|
1070
|
-
│ --- ┆ --- ┆ --- │
|
|
1071
|
-
│ str ┆ u32 ┆ u32 │
|
|
1072
|
-
╞═════╪═══════════════╪═══════════════════╡
|
|
1073
|
-
│ AAA ┆ null ┆ 0 │
|
|
1074
|
-
│ aAa ┆ 1 ┆ 0 │
|
|
1075
|
-
│ aaa ┆ null ┆ 0 │
|
|
1076
|
-
└─────┴───────────────┴───────────────────┘
|
|
1077
|
-
|
|
1078
|
-
See the regex crate's section on `grouping and flags
|
|
1079
|
-
<https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
|
|
1080
|
-
additional information about the use of inline expression modifiers.
|
|
1081
|
-
|
|
1082
|
-
See Also
|
|
1083
|
-
--------
|
|
1084
|
-
contains : Check if the string contains a substring that matches a pattern.
|
|
1085
|
-
|
|
1086
|
-
Examples
|
|
1087
|
-
--------
|
|
1088
|
-
>>> df = pl.DataFrame(
|
|
1089
|
-
... {
|
|
1090
|
-
... "txt": ["Crab", "Lobster", None, "Crustacean"],
|
|
1091
|
-
... "pat": ["a[bc]", "b.t", "[aeiuo]", "(?i)A[BC]"],
|
|
1092
|
-
... }
|
|
1093
|
-
... )
|
|
1094
|
-
|
|
1095
|
-
Find the index of the first substring matching a regex or literal pattern:
|
|
1096
|
-
|
|
1097
|
-
>>> df.select(
|
|
1098
|
-
... pl.col("txt"),
|
|
1099
|
-
... pl.col("txt").str.find("a|e").alias("a|e (regex)"),
|
|
1100
|
-
... pl.col("txt").str.find("e", literal=True).alias("e (lit)"),
|
|
1101
|
-
... )
|
|
1102
|
-
shape: (4, 3)
|
|
1103
|
-
┌────────────┬─────────────┬─────────┐
|
|
1104
|
-
│ txt ┆ a|e (regex) ┆ e (lit) │
|
|
1105
|
-
│ --- ┆ --- ┆ --- │
|
|
1106
|
-
│ str ┆ u32 ┆ u32 │
|
|
1107
|
-
╞════════════╪═════════════╪═════════╡
|
|
1108
|
-
│ Crab ┆ 2 ┆ null │
|
|
1109
|
-
│ Lobster ┆ 5 ┆ 5 │
|
|
1110
|
-
│ null ┆ null ┆ null │
|
|
1111
|
-
│ Crustacean ┆ 5 ┆ 7 │
|
|
1112
|
-
└────────────┴─────────────┴─────────┘
|
|
1113
|
-
|
|
1114
|
-
Match against a pattern found in another column or (expression):
|
|
1115
|
-
|
|
1116
|
-
>>> df.with_columns(pl.col("txt").str.find(pl.col("pat")).alias("find_pat"))
|
|
1117
|
-
shape: (4, 3)
|
|
1118
|
-
┌────────────┬───────────┬──────────┐
|
|
1119
|
-
│ txt ┆ pat ┆ find_pat │
|
|
1120
|
-
│ --- ┆ --- ┆ --- │
|
|
1121
|
-
│ str ┆ str ┆ u32 │
|
|
1122
|
-
╞════════════╪═══════════╪══════════╡
|
|
1123
|
-
│ Crab ┆ a[bc] ┆ 2 │
|
|
1124
|
-
│ Lobster ┆ b.t ┆ 2 │
|
|
1125
|
-
│ null ┆ [aeiuo] ┆ null │
|
|
1126
|
-
│ Crustacean ┆ (?i)A[BC] ┆ 5 │
|
|
1127
|
-
└────────────┴───────────┴──────────┘
|
|
1128
|
-
"""
|
|
1129
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
1130
|
-
return wrap_expr(self._pyexpr.str_find(pattern_pyexpr, literal, strict))
|
|
1131
|
-
|
|
1132
|
-
def ends_with(self, suffix: str | Expr) -> Expr:
|
|
1133
|
-
"""
|
|
1134
|
-
Check if string values end with a substring.
|
|
1135
|
-
|
|
1136
|
-
Parameters
|
|
1137
|
-
----------
|
|
1138
|
-
suffix
|
|
1139
|
-
Suffix substring.
|
|
1140
|
-
|
|
1141
|
-
See Also
|
|
1142
|
-
--------
|
|
1143
|
-
contains : Check if the string contains a substring that matches a pattern.
|
|
1144
|
-
starts_with : Check if string values start with a substring.
|
|
1145
|
-
|
|
1146
|
-
Examples
|
|
1147
|
-
--------
|
|
1148
|
-
>>> df = pl.DataFrame({"fruits": ["apple", "mango", None]})
|
|
1149
|
-
>>> df.with_columns(
|
|
1150
|
-
... pl.col("fruits").str.ends_with("go").alias("has_suffix"),
|
|
1151
|
-
... )
|
|
1152
|
-
shape: (3, 2)
|
|
1153
|
-
┌────────┬────────────┐
|
|
1154
|
-
│ fruits ┆ has_suffix │
|
|
1155
|
-
│ --- ┆ --- │
|
|
1156
|
-
│ str ┆ bool │
|
|
1157
|
-
╞════════╪════════════╡
|
|
1158
|
-
│ apple ┆ false │
|
|
1159
|
-
│ mango ┆ true │
|
|
1160
|
-
│ null ┆ null │
|
|
1161
|
-
└────────┴────────────┘
|
|
1162
|
-
|
|
1163
|
-
>>> df = pl.DataFrame(
|
|
1164
|
-
... {"fruits": ["apple", "mango", "banana"], "suffix": ["le", "go", "nu"]}
|
|
1165
|
-
... )
|
|
1166
|
-
>>> df.with_columns(
|
|
1167
|
-
... pl.col("fruits").str.ends_with(pl.col("suffix")).alias("has_suffix"),
|
|
1168
|
-
... )
|
|
1169
|
-
shape: (3, 3)
|
|
1170
|
-
┌────────┬────────┬────────────┐
|
|
1171
|
-
│ fruits ┆ suffix ┆ has_suffix │
|
|
1172
|
-
│ --- ┆ --- ┆ --- │
|
|
1173
|
-
│ str ┆ str ┆ bool │
|
|
1174
|
-
╞════════╪════════╪════════════╡
|
|
1175
|
-
│ apple ┆ le ┆ true │
|
|
1176
|
-
│ mango ┆ go ┆ true │
|
|
1177
|
-
│ banana ┆ nu ┆ false │
|
|
1178
|
-
└────────┴────────┴────────────┘
|
|
1179
|
-
|
|
1180
|
-
Using `ends_with` as a filter condition:
|
|
1181
|
-
|
|
1182
|
-
>>> df.filter(pl.col("fruits").str.ends_with("go"))
|
|
1183
|
-
shape: (1, 2)
|
|
1184
|
-
┌────────┬────────┐
|
|
1185
|
-
│ fruits ┆ suffix │
|
|
1186
|
-
│ --- ┆ --- │
|
|
1187
|
-
│ str ┆ str │
|
|
1188
|
-
╞════════╪════════╡
|
|
1189
|
-
│ mango ┆ go │
|
|
1190
|
-
└────────┴────────┘
|
|
1191
|
-
"""
|
|
1192
|
-
suffix_pyexpr = parse_into_expression(suffix, str_as_lit=True)
|
|
1193
|
-
return wrap_expr(self._pyexpr.str_ends_with(suffix_pyexpr))
|
|
1194
|
-
|
|
1195
|
-
def starts_with(self, prefix: str | Expr) -> Expr:
|
|
1196
|
-
"""
|
|
1197
|
-
Check if string values start with a substring.
|
|
1198
|
-
|
|
1199
|
-
Parameters
|
|
1200
|
-
----------
|
|
1201
|
-
prefix
|
|
1202
|
-
Prefix substring.
|
|
1203
|
-
|
|
1204
|
-
See Also
|
|
1205
|
-
--------
|
|
1206
|
-
contains : Check if the string contains a substring that matches a pattern.
|
|
1207
|
-
ends_with : Check if string values end with a substring.
|
|
1208
|
-
|
|
1209
|
-
Examples
|
|
1210
|
-
--------
|
|
1211
|
-
>>> df = pl.DataFrame({"fruits": ["apple", "mango", None]})
|
|
1212
|
-
>>> df.with_columns(
|
|
1213
|
-
... pl.col("fruits").str.starts_with("app").alias("has_prefix"),
|
|
1214
|
-
... )
|
|
1215
|
-
shape: (3, 2)
|
|
1216
|
-
┌────────┬────────────┐
|
|
1217
|
-
│ fruits ┆ has_prefix │
|
|
1218
|
-
│ --- ┆ --- │
|
|
1219
|
-
│ str ┆ bool │
|
|
1220
|
-
╞════════╪════════════╡
|
|
1221
|
-
│ apple ┆ true │
|
|
1222
|
-
│ mango ┆ false │
|
|
1223
|
-
│ null ┆ null │
|
|
1224
|
-
└────────┴────────────┘
|
|
1225
|
-
|
|
1226
|
-
>>> df = pl.DataFrame(
|
|
1227
|
-
... {"fruits": ["apple", "mango", "banana"], "prefix": ["app", "na", "ba"]}
|
|
1228
|
-
... )
|
|
1229
|
-
>>> df.with_columns(
|
|
1230
|
-
... pl.col("fruits").str.starts_with(pl.col("prefix")).alias("has_prefix"),
|
|
1231
|
-
... )
|
|
1232
|
-
shape: (3, 3)
|
|
1233
|
-
┌────────┬────────┬────────────┐
|
|
1234
|
-
│ fruits ┆ prefix ┆ has_prefix │
|
|
1235
|
-
│ --- ┆ --- ┆ --- │
|
|
1236
|
-
│ str ┆ str ┆ bool │
|
|
1237
|
-
╞════════╪════════╪════════════╡
|
|
1238
|
-
│ apple ┆ app ┆ true │
|
|
1239
|
-
│ mango ┆ na ┆ false │
|
|
1240
|
-
│ banana ┆ ba ┆ true │
|
|
1241
|
-
└────────┴────────┴────────────┘
|
|
1242
|
-
|
|
1243
|
-
Using `starts_with` as a filter condition:
|
|
1244
|
-
|
|
1245
|
-
>>> df.filter(pl.col("fruits").str.starts_with("app"))
|
|
1246
|
-
shape: (1, 2)
|
|
1247
|
-
┌────────┬────────┐
|
|
1248
|
-
│ fruits ┆ prefix │
|
|
1249
|
-
│ --- ┆ --- │
|
|
1250
|
-
│ str ┆ str │
|
|
1251
|
-
╞════════╪════════╡
|
|
1252
|
-
│ apple ┆ app │
|
|
1253
|
-
└────────┴────────┘
|
|
1254
|
-
"""
|
|
1255
|
-
prefix_pyexpr = parse_into_expression(prefix, str_as_lit=True)
|
|
1256
|
-
return wrap_expr(self._pyexpr.str_starts_with(prefix_pyexpr))
|
|
1257
|
-
|
|
1258
|
-
def json_decode(
|
|
1259
|
-
self,
|
|
1260
|
-
dtype: PolarsDataType | pl.DataTypeExpr,
|
|
1261
|
-
*,
|
|
1262
|
-
infer_schema_length: int | None = None,
|
|
1263
|
-
) -> Expr:
|
|
1264
|
-
"""
|
|
1265
|
-
Parse string values as JSON.
|
|
1266
|
-
|
|
1267
|
-
Throws an error if invalid JSON strings are encountered.
|
|
1268
|
-
|
|
1269
|
-
Parameters
|
|
1270
|
-
----------
|
|
1271
|
-
dtype
|
|
1272
|
-
The dtype to cast the extracted value to.
|
|
1273
|
-
infer_schema_length
|
|
1274
|
-
Deprecated and ignored.
|
|
1275
|
-
|
|
1276
|
-
.. versionchanged: 1.33.0
|
|
1277
|
-
Deprecate `infer_schema_length` and make `dtype` non-optional to
|
|
1278
|
-
ensure that the planner can determine the output datatype.
|
|
1279
|
-
|
|
1280
|
-
See Also
|
|
1281
|
-
--------
|
|
1282
|
-
json_path_match : Extract the first match from a JSON string using the provided
|
|
1283
|
-
JSONPath.
|
|
1284
|
-
|
|
1285
|
-
Examples
|
|
1286
|
-
--------
|
|
1287
|
-
>>> df = pl.DataFrame(
|
|
1288
|
-
... {"json": ['{"a":1, "b": true}', None, '{"a":2, "b": false}']}
|
|
1289
|
-
... )
|
|
1290
|
-
>>> dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
|
|
1291
|
-
>>> df.with_columns(decoded=pl.col("json").str.json_decode(dtype))
|
|
1292
|
-
shape: (3, 2)
|
|
1293
|
-
┌─────────────────────┬───────────┐
|
|
1294
|
-
│ json ┆ decoded │
|
|
1295
|
-
│ --- ┆ --- │
|
|
1296
|
-
│ str ┆ struct[2] │
|
|
1297
|
-
╞═════════════════════╪═══════════╡
|
|
1298
|
-
│ {"a":1, "b": true} ┆ {1,true} │
|
|
1299
|
-
│ null ┆ null │
|
|
1300
|
-
│ {"a":2, "b": false} ┆ {2,false} │
|
|
1301
|
-
└─────────────────────┴───────────┘
|
|
1302
|
-
"""
|
|
1303
|
-
if dtype is None:
|
|
1304
|
-
msg = "`Expr.str.json_decode` needs an explicitly given `dtype` otherwise Polars is not able to determine the output type. If you want to eagerly infer datatype you can use `Series.str.json_decode`."
|
|
1305
|
-
raise TypeError(msg)
|
|
1306
|
-
|
|
1307
|
-
if infer_schema_length is not None:
|
|
1308
|
-
issue_warning(
|
|
1309
|
-
"`Expr.str.json_decode` with `infer_schema_length` is deprecated and has no effect on execution.",
|
|
1310
|
-
DeprecationWarning,
|
|
1311
|
-
)
|
|
1312
|
-
|
|
1313
|
-
dtype_expr = parse_into_datatype_expr(dtype)._pydatatype_expr
|
|
1314
|
-
return wrap_expr(self._pyexpr.str_json_decode(dtype_expr))
|
|
1315
|
-
|
|
1316
|
-
def json_path_match(self, json_path: IntoExprColumn) -> Expr:
|
|
1317
|
-
"""
|
|
1318
|
-
Extract the first match from a JSON string using the provided JSONPath.
|
|
1319
|
-
|
|
1320
|
-
Throws errors if invalid JSON strings are encountered. All return values
|
|
1321
|
-
are cast to :class:`String`, regardless of the original value.
|
|
1322
|
-
|
|
1323
|
-
Documentation on the JSONPath standard can be found
|
|
1324
|
-
`here <https://goessner.net/articles/JsonPath/>`_.
|
|
1325
|
-
|
|
1326
|
-
Parameters
|
|
1327
|
-
----------
|
|
1328
|
-
json_path
|
|
1329
|
-
A valid JSONPath query string.
|
|
1330
|
-
|
|
1331
|
-
Returns
|
|
1332
|
-
-------
|
|
1333
|
-
Expr
|
|
1334
|
-
Expression of data type :class:`String`. Contains null values if original
|
|
1335
|
-
value is null or the json_path returns nothing.
|
|
1336
|
-
|
|
1337
|
-
Examples
|
|
1338
|
-
--------
|
|
1339
|
-
>>> df = pl.DataFrame(
|
|
1340
|
-
... {"json_val": ['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']}
|
|
1341
|
-
... )
|
|
1342
|
-
>>> df.with_columns(matched=pl.col("json_val").str.json_path_match("$.a"))
|
|
1343
|
-
shape: (5, 2)
|
|
1344
|
-
┌────────────┬─────────┐
|
|
1345
|
-
│ json_val ┆ matched │
|
|
1346
|
-
│ --- ┆ --- │
|
|
1347
|
-
│ str ┆ str │
|
|
1348
|
-
╞════════════╪═════════╡
|
|
1349
|
-
│ {"a":"1"} ┆ 1 │
|
|
1350
|
-
│ null ┆ null │
|
|
1351
|
-
│ {"a":2} ┆ 2 │
|
|
1352
|
-
│ {"a":2.1} ┆ 2.1 │
|
|
1353
|
-
│ {"a":true} ┆ true │
|
|
1354
|
-
└────────────┴─────────┘
|
|
1355
|
-
"""
|
|
1356
|
-
json_path_pyexpr = parse_into_expression(json_path, str_as_lit=True)
|
|
1357
|
-
return wrap_expr(self._pyexpr.str_json_path_match(json_path_pyexpr))
|
|
1358
|
-
|
|
1359
|
-
def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Expr:
|
|
1360
|
-
r"""
|
|
1361
|
-
Decode values using the provided encoding.
|
|
1362
|
-
|
|
1363
|
-
Parameters
|
|
1364
|
-
----------
|
|
1365
|
-
encoding : {'hex', 'base64'}
|
|
1366
|
-
The encoding to use.
|
|
1367
|
-
strict
|
|
1368
|
-
Raise an error if the underlying value cannot be decoded,
|
|
1369
|
-
otherwise mask out with a null value.
|
|
1370
|
-
|
|
1371
|
-
Returns
|
|
1372
|
-
-------
|
|
1373
|
-
Expr
|
|
1374
|
-
Expression of data type :class:`Binary`.
|
|
1375
|
-
|
|
1376
|
-
Examples
|
|
1377
|
-
--------
|
|
1378
|
-
>>> df = pl.DataFrame({"color": ["000000", "ffff00", "0000ff"]})
|
|
1379
|
-
>>> df.with_columns(pl.col("color").str.decode("hex").alias("decoded"))
|
|
1380
|
-
shape: (3, 2)
|
|
1381
|
-
┌────────┬─────────────────┐
|
|
1382
|
-
│ color ┆ decoded │
|
|
1383
|
-
│ --- ┆ --- │
|
|
1384
|
-
│ str ┆ binary │
|
|
1385
|
-
╞════════╪═════════════════╡
|
|
1386
|
-
│ 000000 ┆ b"\x00\x00\x00" │
|
|
1387
|
-
│ ffff00 ┆ b"\xff\xff\x00" │
|
|
1388
|
-
│ 0000ff ┆ b"\x00\x00\xff" │
|
|
1389
|
-
└────────┴─────────────────┘
|
|
1390
|
-
"""
|
|
1391
|
-
if encoding == "hex":
|
|
1392
|
-
return wrap_expr(self._pyexpr.str_hex_decode(strict))
|
|
1393
|
-
elif encoding == "base64":
|
|
1394
|
-
return wrap_expr(self._pyexpr.str_base64_decode(strict))
|
|
1395
|
-
else:
|
|
1396
|
-
msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
|
|
1397
|
-
raise ValueError(msg)
|
|
1398
|
-
|
|
1399
|
-
def encode(self, encoding: TransferEncoding) -> Expr:
|
|
1400
|
-
"""
|
|
1401
|
-
Encode values using the provided encoding.
|
|
1402
|
-
|
|
1403
|
-
Parameters
|
|
1404
|
-
----------
|
|
1405
|
-
encoding : {'hex', 'base64'}
|
|
1406
|
-
The encoding to use.
|
|
1407
|
-
|
|
1408
|
-
Returns
|
|
1409
|
-
-------
|
|
1410
|
-
Expr
|
|
1411
|
-
Expression of data type :class:`String`.
|
|
1412
|
-
|
|
1413
|
-
Examples
|
|
1414
|
-
--------
|
|
1415
|
-
>>> df = pl.DataFrame({"strings": ["foo", "bar", None]})
|
|
1416
|
-
>>> df.with_columns(strings_hex=pl.col("strings").str.encode("hex"))
|
|
1417
|
-
shape: (3, 2)
|
|
1418
|
-
┌─────────┬─────────────┐
|
|
1419
|
-
│ strings ┆ strings_hex │
|
|
1420
|
-
│ --- ┆ --- │
|
|
1421
|
-
│ str ┆ str │
|
|
1422
|
-
╞═════════╪═════════════╡
|
|
1423
|
-
│ foo ┆ 666f6f │
|
|
1424
|
-
│ bar ┆ 626172 │
|
|
1425
|
-
│ null ┆ null │
|
|
1426
|
-
└─────────┴─────────────┘
|
|
1427
|
-
"""
|
|
1428
|
-
if encoding == "hex":
|
|
1429
|
-
return wrap_expr(self._pyexpr.str_hex_encode())
|
|
1430
|
-
elif encoding == "base64":
|
|
1431
|
-
return wrap_expr(self._pyexpr.str_base64_encode())
|
|
1432
|
-
else:
|
|
1433
|
-
msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
|
|
1434
|
-
raise ValueError(msg)
|
|
1435
|
-
|
|
1436
|
-
def extract(self, pattern: IntoExprColumn, group_index: int = 1) -> Expr:
|
|
1437
|
-
r"""
|
|
1438
|
-
Extract the target capture group from provided patterns.
|
|
1439
|
-
|
|
1440
|
-
Parameters
|
|
1441
|
-
----------
|
|
1442
|
-
pattern
|
|
1443
|
-
A valid regular expression pattern containing at least one capture group,
|
|
1444
|
-
compatible with the `regex crate <https://docs.rs/regex/latest/regex/>`_.
|
|
1445
|
-
group_index
|
|
1446
|
-
Index of the targeted capture group.
|
|
1447
|
-
Group 0 means the whole pattern, the first group begins at index 1.
|
|
1448
|
-
Defaults to the first capture group.
|
|
1449
|
-
|
|
1450
|
-
Notes
|
|
1451
|
-
-----
|
|
1452
|
-
To modify regular expression behaviour (such as multi-line matching)
|
|
1453
|
-
with flags, use the inline `(?iLmsuxU)` syntax. For example:
|
|
1454
|
-
|
|
1455
|
-
>>> df = pl.DataFrame(
|
|
1456
|
-
... data={
|
|
1457
|
-
... "lines": [
|
|
1458
|
-
... "I Like\nThose\nOdds",
|
|
1459
|
-
... "This is\nThe Way",
|
|
1460
|
-
... ]
|
|
1461
|
-
... }
|
|
1462
|
-
... )
|
|
1463
|
-
>>> df.with_columns(
|
|
1464
|
-
... pl.col("lines").str.extract(r"(?m)^(T\w+)", 1).alias("matches"),
|
|
1465
|
-
... )
|
|
1466
|
-
shape: (2, 2)
|
|
1467
|
-
┌─────────┬─────────┐
|
|
1468
|
-
│ lines ┆ matches │
|
|
1469
|
-
│ --- ┆ --- │
|
|
1470
|
-
│ str ┆ str │
|
|
1471
|
-
╞═════════╪═════════╡
|
|
1472
|
-
│ I Like ┆ Those │
|
|
1473
|
-
│ Those ┆ │
|
|
1474
|
-
│ Odds ┆ │
|
|
1475
|
-
│ This is ┆ This │
|
|
1476
|
-
│ The Way ┆ │
|
|
1477
|
-
└─────────┴─────────┘
|
|
1478
|
-
|
|
1479
|
-
See the regex crate's section on `grouping and flags
|
|
1480
|
-
<https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
|
|
1481
|
-
additional information about the use of inline expression modifiers.
|
|
1482
|
-
|
|
1483
|
-
Returns
|
|
1484
|
-
-------
|
|
1485
|
-
Expr
|
|
1486
|
-
Expression of data type :class:`String`. Contains null values if original
|
|
1487
|
-
value is null or the regex captures nothing.
|
|
1488
|
-
|
|
1489
|
-
Examples
|
|
1490
|
-
--------
|
|
1491
|
-
>>> df = pl.DataFrame(
|
|
1492
|
-
... {
|
|
1493
|
-
... "url": [
|
|
1494
|
-
... "http://vote.com/ballon_dor?error=404&ref=unknown",
|
|
1495
|
-
... "http://vote.com/ballon_dor?ref=polars&candidate=messi",
|
|
1496
|
-
... "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
|
|
1497
|
-
... ]
|
|
1498
|
-
... }
|
|
1499
|
-
... )
|
|
1500
|
-
>>> df.select(
|
|
1501
|
-
... pl.col("url").str.extract(r"candidate=(\w+)", 1).alias("candidate"),
|
|
1502
|
-
... pl.col("url").str.extract(r"ref=(\w+)", 1).alias("referer"),
|
|
1503
|
-
... pl.col("url").str.extract(r"error=(\w+)", 1).alias("error"),
|
|
1504
|
-
... )
|
|
1505
|
-
shape: (3, 3)
|
|
1506
|
-
┌───────────┬─────────┬───────┐
|
|
1507
|
-
│ candidate ┆ referer ┆ error │
|
|
1508
|
-
│ --- ┆ --- ┆ --- │
|
|
1509
|
-
│ str ┆ str ┆ str │
|
|
1510
|
-
╞═══════════╪═════════╪═══════╡
|
|
1511
|
-
│ null ┆ unknown ┆ 404 │
|
|
1512
|
-
│ messi ┆ polars ┆ null │
|
|
1513
|
-
│ ronaldo ┆ polars ┆ null │
|
|
1514
|
-
└───────────┴─────────┴───────┘
|
|
1515
|
-
"""
|
|
1516
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
1517
|
-
return wrap_expr(self._pyexpr.str_extract(pattern_pyexpr, group_index))
|
|
1518
|
-
|
|
1519
|
-
def extract_all(self, pattern: str | Expr) -> Expr:
|
|
1520
|
-
r'''
|
|
1521
|
-
Extract all matches for the given regex pattern.
|
|
1522
|
-
|
|
1523
|
-
Extract each successive non-overlapping regex match in an individual string
|
|
1524
|
-
as a list. If the haystack string is `null`, `null` is returned.
|
|
1525
|
-
|
|
1526
|
-
Parameters
|
|
1527
|
-
----------
|
|
1528
|
-
pattern
|
|
1529
|
-
A valid regular expression pattern, compatible with the `regex crate
|
|
1530
|
-
<https://docs.rs/regex/latest/regex/>`_.
|
|
1531
|
-
|
|
1532
|
-
Notes
|
|
1533
|
-
-----
|
|
1534
|
-
To modify regular expression behaviour (such as "verbose" mode and/or
|
|
1535
|
-
case-sensitive matching) with flags, use the inline `(?iLmsuxU)` syntax.
|
|
1536
|
-
For example:
|
|
1537
|
-
|
|
1538
|
-
>>> df = pl.DataFrame(
|
|
1539
|
-
... data={
|
|
1540
|
-
... "email": [
|
|
1541
|
-
... "real.email@spam.com",
|
|
1542
|
-
... "some_account@somewhere.net",
|
|
1543
|
-
... "abc.def.ghi.jkl@uvw.xyz.co.uk",
|
|
1544
|
-
... ]
|
|
1545
|
-
... }
|
|
1546
|
-
... )
|
|
1547
|
-
>>> # extract name/domain parts from the addresses, using verbose regex
|
|
1548
|
-
>>> df.with_columns(
|
|
1549
|
-
... pl.col("email")
|
|
1550
|
-
... .str.extract_all(
|
|
1551
|
-
... r"""(?xi) # activate 'verbose' and 'case-insensitive' flags
|
|
1552
|
-
... [ # (start character group)
|
|
1553
|
-
... A-Z # letters
|
|
1554
|
-
... 0-9 # digits
|
|
1555
|
-
... ._%+\- # special chars
|
|
1556
|
-
... ] # (end character group)
|
|
1557
|
-
... + # 'one or more' quantifier
|
|
1558
|
-
... """
|
|
1559
|
-
... )
|
|
1560
|
-
... .list.to_struct(fields=["name", "domain"])
|
|
1561
|
-
... .alias("email_parts")
|
|
1562
|
-
... ).unnest("email_parts")
|
|
1563
|
-
shape: (3, 3)
|
|
1564
|
-
┌───────────────────────────────┬─────────────────┬───────────────┐
|
|
1565
|
-
│ email ┆ name ┆ domain │
|
|
1566
|
-
│ --- ┆ --- ┆ --- │
|
|
1567
|
-
│ str ┆ str ┆ str │
|
|
1568
|
-
╞═══════════════════════════════╪═════════════════╪═══════════════╡
|
|
1569
|
-
│ real.email@spam.com ┆ real.email ┆ spam.com │
|
|
1570
|
-
│ some_account@somewhere.net ┆ some_account ┆ somewhere.net │
|
|
1571
|
-
│ abc.def.ghi.jkl@uvw.xyz.co.uk ┆ abc.def.ghi.jkl ┆ uvw.xyz.co.uk │
|
|
1572
|
-
└───────────────────────────────┴─────────────────┴───────────────┘
|
|
1573
|
-
|
|
1574
|
-
See the regex crate's section on `grouping and flags
|
|
1575
|
-
<https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
|
|
1576
|
-
additional information about the use of inline expression modifiers.
|
|
1577
|
-
|
|
1578
|
-
Returns
|
|
1579
|
-
-------
|
|
1580
|
-
Expr
|
|
1581
|
-
Expression of data type `List(String)`.
|
|
1582
|
-
|
|
1583
|
-
Examples
|
|
1584
|
-
--------
|
|
1585
|
-
>>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t", "bar", None]})
|
|
1586
|
-
>>> df.select(
|
|
1587
|
-
... pl.col("foo").str.extract_all(r"\d+").alias("extracted_nrs"),
|
|
1588
|
-
... )
|
|
1589
|
-
shape: (4, 1)
|
|
1590
|
-
┌────────────────┐
|
|
1591
|
-
│ extracted_nrs │
|
|
1592
|
-
│ --- │
|
|
1593
|
-
│ list[str] │
|
|
1594
|
-
╞════════════════╡
|
|
1595
|
-
│ ["123", "45"] │
|
|
1596
|
-
│ ["678", "910"] │
|
|
1597
|
-
│ [] │
|
|
1598
|
-
│ null │
|
|
1599
|
-
└────────────────┘
|
|
1600
|
-
|
|
1601
|
-
'''
|
|
1602
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
1603
|
-
return wrap_expr(self._pyexpr.str_extract_all(pattern_pyexpr))
|
|
1604
|
-
|
|
1605
|
-
def extract_groups(self, pattern: str) -> Expr:
|
|
1606
|
-
r"""
|
|
1607
|
-
Extract all capture groups for the given regex pattern.
|
|
1608
|
-
|
|
1609
|
-
Parameters
|
|
1610
|
-
----------
|
|
1611
|
-
pattern
|
|
1612
|
-
A valid regular expression pattern containing at least one capture group,
|
|
1613
|
-
compatible with the `regex crate <https://docs.rs/regex/latest/regex/>`_.
|
|
1614
|
-
|
|
1615
|
-
Notes
|
|
1616
|
-
-----
|
|
1617
|
-
All group names are **strings**.
|
|
1618
|
-
|
|
1619
|
-
If your pattern contains unnamed groups, their numerical position is converted
|
|
1620
|
-
to a string.
|
|
1621
|
-
|
|
1622
|
-
For example, here we access groups 2 and 3 via the names `"2"` and `"3"`::
|
|
1623
|
-
|
|
1624
|
-
>>> df = pl.DataFrame({"col": ["foo bar baz"]})
|
|
1625
|
-
>>> (
|
|
1626
|
-
... df.with_columns(
|
|
1627
|
-
... pl.col("col").str.extract_groups(r"(\S+) (\S+) (.+)")
|
|
1628
|
-
... ).select(pl.col("col").struct["2"], pl.col("col").struct["3"])
|
|
1629
|
-
... )
|
|
1630
|
-
shape: (1, 2)
|
|
1631
|
-
┌─────┬─────┐
|
|
1632
|
-
│ 2 ┆ 3 │
|
|
1633
|
-
│ --- ┆ --- │
|
|
1634
|
-
│ str ┆ str │
|
|
1635
|
-
╞═════╪═════╡
|
|
1636
|
-
│ bar ┆ baz │
|
|
1637
|
-
└─────┴─────┘
|
|
1638
|
-
|
|
1639
|
-
Returns
|
|
1640
|
-
-------
|
|
1641
|
-
Expr
|
|
1642
|
-
Expression of data type :class:`Struct` with fields of data type
|
|
1643
|
-
:class:`String`.
|
|
1644
|
-
|
|
1645
|
-
Examples
|
|
1646
|
-
--------
|
|
1647
|
-
>>> df = pl.DataFrame(
|
|
1648
|
-
... data={
|
|
1649
|
-
... "url": [
|
|
1650
|
-
... "http://vote.com/ballon_dor?candidate=messi&ref=python",
|
|
1651
|
-
... "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
|
|
1652
|
-
... "http://vote.com/ballon_dor?error=404&ref=rust",
|
|
1653
|
-
... ]
|
|
1654
|
-
... }
|
|
1655
|
-
... )
|
|
1656
|
-
>>> pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"
|
|
1657
|
-
>>> df.select(captures=pl.col("url").str.extract_groups(pattern)).unnest(
|
|
1658
|
-
... "captures"
|
|
1659
|
-
... )
|
|
1660
|
-
shape: (3, 2)
|
|
1661
|
-
┌───────────┬────────┐
|
|
1662
|
-
│ candidate ┆ ref │
|
|
1663
|
-
│ --- ┆ --- │
|
|
1664
|
-
│ str ┆ str │
|
|
1665
|
-
╞═══════════╪════════╡
|
|
1666
|
-
│ messi ┆ python │
|
|
1667
|
-
│ weghorst ┆ polars │
|
|
1668
|
-
│ null ┆ null │
|
|
1669
|
-
└───────────┴────────┘
|
|
1670
|
-
|
|
1671
|
-
Unnamed groups have their numerical position converted to a string:
|
|
1672
|
-
|
|
1673
|
-
>>> pattern = r"candidate=(\w+)&ref=(\w+)"
|
|
1674
|
-
>>> (
|
|
1675
|
-
... df.with_columns(
|
|
1676
|
-
... captures=pl.col("url").str.extract_groups(pattern)
|
|
1677
|
-
... ).with_columns(name=pl.col("captures").struct["1"].str.to_uppercase())
|
|
1678
|
-
... )
|
|
1679
|
-
shape: (3, 3)
|
|
1680
|
-
┌─────────────────────────────────┬───────────────────────┬──────────┐
|
|
1681
|
-
│ url ┆ captures ┆ name │
|
|
1682
|
-
│ --- ┆ --- ┆ --- │
|
|
1683
|
-
│ str ┆ struct[2] ┆ str │
|
|
1684
|
-
╞═════════════════════════════════╪═══════════════════════╪══════════╡
|
|
1685
|
-
│ http://vote.com/ballon_dor?can… ┆ {"messi","python"} ┆ MESSI │
|
|
1686
|
-
│ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │
|
|
1687
|
-
│ http://vote.com/ballon_dor?err… ┆ {null,null} ┆ null │
|
|
1688
|
-
└─────────────────────────────────┴───────────────────────┴──────────┘
|
|
1689
|
-
"""
|
|
1690
|
-
if not isinstance(pattern, str):
|
|
1691
|
-
msg = f'"extract_groups" expects a `str`, given a {qualified_type_name(pattern)!r}'
|
|
1692
|
-
raise TypeError(msg)
|
|
1693
|
-
return wrap_expr(self._pyexpr.str_extract_groups(pattern))
|
|
1694
|
-
|
|
1695
|
-
def count_matches(self, pattern: str | Expr, *, literal: bool = False) -> Expr:
|
|
1696
|
-
r"""
|
|
1697
|
-
Count all successive non-overlapping regex matches.
|
|
1698
|
-
|
|
1699
|
-
Parameters
|
|
1700
|
-
----------
|
|
1701
|
-
pattern
|
|
1702
|
-
A valid regular expression pattern, compatible with the `regex crate
|
|
1703
|
-
<https://docs.rs/regex/latest/regex/>`_.
|
|
1704
|
-
literal
|
|
1705
|
-
Treat `pattern` as a literal string, not as a regular expression.
|
|
1706
|
-
|
|
1707
|
-
Returns
|
|
1708
|
-
-------
|
|
1709
|
-
Expr
|
|
1710
|
-
Expression of data type :class:`UInt32`. Returns null if the
|
|
1711
|
-
original value is null.
|
|
1712
|
-
|
|
1713
|
-
Examples
|
|
1714
|
-
--------
|
|
1715
|
-
>>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t", "bar", None]})
|
|
1716
|
-
>>> df.with_columns(
|
|
1717
|
-
... pl.col("foo").str.count_matches(r"\d").alias("count_digits"),
|
|
1718
|
-
... )
|
|
1719
|
-
shape: (4, 2)
|
|
1720
|
-
┌────────────────┬──────────────┐
|
|
1721
|
-
│ foo ┆ count_digits │
|
|
1722
|
-
│ --- ┆ --- │
|
|
1723
|
-
│ str ┆ u32 │
|
|
1724
|
-
╞════════════════╪══════════════╡
|
|
1725
|
-
│ 123 bla 45 asd ┆ 5 │
|
|
1726
|
-
│ xyz 678 910t ┆ 6 │
|
|
1727
|
-
│ bar ┆ 0 │
|
|
1728
|
-
│ null ┆ null │
|
|
1729
|
-
└────────────────┴──────────────┘
|
|
1730
|
-
|
|
1731
|
-
>>> df = pl.DataFrame({"bar": ["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None]})
|
|
1732
|
-
>>> df.with_columns(
|
|
1733
|
-
... pl.col("bar")
|
|
1734
|
-
... .str.count_matches(r"\d", literal=True)
|
|
1735
|
-
... .alias("count_digits"),
|
|
1736
|
-
... )
|
|
1737
|
-
shape: (4, 2)
|
|
1738
|
-
┌────────────┬──────────────┐
|
|
1739
|
-
│ bar ┆ count_digits │
|
|
1740
|
-
│ --- ┆ --- │
|
|
1741
|
-
│ str ┆ u32 │
|
|
1742
|
-
╞════════════╪══════════════╡
|
|
1743
|
-
│ 12 dbc 3xy ┆ 0 │
|
|
1744
|
-
│ cat\w ┆ 0 │
|
|
1745
|
-
│ 1zy3\d\d ┆ 2 │
|
|
1746
|
-
│ null ┆ null │
|
|
1747
|
-
└────────────┴──────────────┘
|
|
1748
|
-
"""
|
|
1749
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
1750
|
-
return wrap_expr(self._pyexpr.str_count_matches(pattern_pyexpr, literal))
|
|
1751
|
-
|
|
1752
|
-
def split(self, by: IntoExpr, *, inclusive: bool = False) -> Expr:
|
|
1753
|
-
"""
|
|
1754
|
-
Split the string by a substring.
|
|
1755
|
-
|
|
1756
|
-
Parameters
|
|
1757
|
-
----------
|
|
1758
|
-
by
|
|
1759
|
-
Substring to split by.
|
|
1760
|
-
inclusive
|
|
1761
|
-
If True, include the split character/string in the results.
|
|
1762
|
-
|
|
1763
|
-
Examples
|
|
1764
|
-
--------
|
|
1765
|
-
>>> df = pl.DataFrame({"s": ["foo bar", "foo_bar", "foo_bar_baz"]})
|
|
1766
|
-
>>> df.with_columns(
|
|
1767
|
-
... pl.col("s").str.split(by="_").alias("split"),
|
|
1768
|
-
... pl.col("s").str.split(by="_", inclusive=True).alias("split_inclusive"),
|
|
1769
|
-
... )
|
|
1770
|
-
shape: (3, 3)
|
|
1771
|
-
┌─────────────┬───────────────────────┬─────────────────────────┐
|
|
1772
|
-
│ s ┆ split ┆ split_inclusive │
|
|
1773
|
-
│ --- ┆ --- ┆ --- │
|
|
1774
|
-
│ str ┆ list[str] ┆ list[str] │
|
|
1775
|
-
╞═════════════╪═══════════════════════╪═════════════════════════╡
|
|
1776
|
-
│ foo bar ┆ ["foo bar"] ┆ ["foo bar"] │
|
|
1777
|
-
│ foo_bar ┆ ["foo", "bar"] ┆ ["foo_", "bar"] │
|
|
1778
|
-
│ foo_bar_baz ┆ ["foo", "bar", "baz"] ┆ ["foo_", "bar_", "baz"] │
|
|
1779
|
-
└─────────────┴───────────────────────┴─────────────────────────┘
|
|
1780
|
-
|
|
1781
|
-
>>> df = pl.DataFrame(
|
|
1782
|
-
... {"s": ["foo^bar", "foo_bar", "foo*bar*baz"], "by": ["_", "_", "*"]}
|
|
1783
|
-
... )
|
|
1784
|
-
>>> df.with_columns(
|
|
1785
|
-
... pl.col("s").str.split(by=pl.col("by")).alias("split"),
|
|
1786
|
-
... pl.col("s")
|
|
1787
|
-
... .str.split(by=pl.col("by"), inclusive=True)
|
|
1788
|
-
... .alias("split_inclusive"),
|
|
1789
|
-
... )
|
|
1790
|
-
shape: (3, 4)
|
|
1791
|
-
┌─────────────┬─────┬───────────────────────┬─────────────────────────┐
|
|
1792
|
-
│ s ┆ by ┆ split ┆ split_inclusive │
|
|
1793
|
-
│ --- ┆ --- ┆ --- ┆ --- │
|
|
1794
|
-
│ str ┆ str ┆ list[str] ┆ list[str] │
|
|
1795
|
-
╞═════════════╪═════╪═══════════════════════╪═════════════════════════╡
|
|
1796
|
-
│ foo^bar ┆ _ ┆ ["foo^bar"] ┆ ["foo^bar"] │
|
|
1797
|
-
│ foo_bar ┆ _ ┆ ["foo", "bar"] ┆ ["foo_", "bar"] │
|
|
1798
|
-
│ foo*bar*baz ┆ * ┆ ["foo", "bar", "baz"] ┆ ["foo*", "bar*", "baz"] │
|
|
1799
|
-
└─────────────┴─────┴───────────────────────┴─────────────────────────┘
|
|
1800
|
-
|
|
1801
|
-
Returns
|
|
1802
|
-
-------
|
|
1803
|
-
Expr
|
|
1804
|
-
Expression of data type :class:`String`.
|
|
1805
|
-
"""
|
|
1806
|
-
by_pyexpr = parse_into_expression(by, str_as_lit=True)
|
|
1807
|
-
if inclusive:
|
|
1808
|
-
return wrap_expr(self._pyexpr.str_split_inclusive(by_pyexpr))
|
|
1809
|
-
return wrap_expr(self._pyexpr.str_split(by_pyexpr))
|
|
1810
|
-
|
|
1811
|
-
def split_exact(self, by: IntoExpr, n: int, *, inclusive: bool = False) -> Expr:
|
|
1812
|
-
"""
|
|
1813
|
-
Split the string by a substring using `n` splits.
|
|
1814
|
-
|
|
1815
|
-
Results in a struct of `n+1` fields.
|
|
1816
|
-
|
|
1817
|
-
If it cannot make `n` splits, the remaining field elements will be null.
|
|
1818
|
-
|
|
1819
|
-
Parameters
|
|
1820
|
-
----------
|
|
1821
|
-
by
|
|
1822
|
-
Substring to split by.
|
|
1823
|
-
n
|
|
1824
|
-
Number of splits to make.
|
|
1825
|
-
inclusive
|
|
1826
|
-
If True, include the split character/string in the results.
|
|
1827
|
-
|
|
1828
|
-
Returns
|
|
1829
|
-
-------
|
|
1830
|
-
Expr
|
|
1831
|
-
Expression of data type :class:`Struct` with fields of data type
|
|
1832
|
-
:class:`String`.
|
|
1833
|
-
|
|
1834
|
-
Examples
|
|
1835
|
-
--------
|
|
1836
|
-
>>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
|
|
1837
|
-
>>> df.with_columns(
|
|
1838
|
-
... extracted=pl.col("x").str.split_exact("_", 1).alias("fields"),
|
|
1839
|
-
... )
|
|
1840
|
-
shape: (4, 2)
|
|
1841
|
-
┌──────┬─────────────┐
|
|
1842
|
-
│ x ┆ extracted │
|
|
1843
|
-
│ --- ┆ --- │
|
|
1844
|
-
│ str ┆ struct[2] │
|
|
1845
|
-
╞══════╪═════════════╡
|
|
1846
|
-
│ a_1 ┆ {"a","1"} │
|
|
1847
|
-
│ null ┆ {null,null} │
|
|
1848
|
-
│ c ┆ {"c",null} │
|
|
1849
|
-
│ d_4 ┆ {"d","4"} │
|
|
1850
|
-
└──────┴─────────────┘
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
Split string values in column x in exactly 2 parts and assign
|
|
1854
|
-
each part to a new column.
|
|
1855
|
-
|
|
1856
|
-
>>> df.with_columns(
|
|
1857
|
-
... pl.col("x")
|
|
1858
|
-
... .str.split_exact("_", 1)
|
|
1859
|
-
... .struct.rename_fields(["first_part", "second_part"])
|
|
1860
|
-
... .alias("fields")
|
|
1861
|
-
... ).unnest("fields")
|
|
1862
|
-
shape: (4, 3)
|
|
1863
|
-
┌──────┬────────────┬─────────────┐
|
|
1864
|
-
│ x ┆ first_part ┆ second_part │
|
|
1865
|
-
│ --- ┆ --- ┆ --- │
|
|
1866
|
-
│ str ┆ str ┆ str │
|
|
1867
|
-
╞══════╪════════════╪═════════════╡
|
|
1868
|
-
│ a_1 ┆ a ┆ 1 │
|
|
1869
|
-
│ null ┆ null ┆ null │
|
|
1870
|
-
│ c ┆ c ┆ null │
|
|
1871
|
-
│ d_4 ┆ d ┆ 4 │
|
|
1872
|
-
└──────┴────────────┴─────────────┘
|
|
1873
|
-
"""
|
|
1874
|
-
by_pyexpr = parse_into_expression(by, str_as_lit=True)
|
|
1875
|
-
if inclusive:
|
|
1876
|
-
return wrap_expr(self._pyexpr.str_split_exact_inclusive(by_pyexpr, n))
|
|
1877
|
-
return wrap_expr(self._pyexpr.str_split_exact(by_pyexpr, n))
|
|
1878
|
-
|
|
1879
|
-
def splitn(self, by: IntoExpr, n: int) -> Expr:
|
|
1880
|
-
"""
|
|
1881
|
-
Split the string by a substring, restricted to returning at most `n` items.
|
|
1882
|
-
|
|
1883
|
-
If the number of possible splits is less than `n-1`, the remaining field
|
|
1884
|
-
elements will be null. If the number of possible splits is `n-1` or greater,
|
|
1885
|
-
the last (nth) substring will contain the remainder of the string.
|
|
1886
|
-
|
|
1887
|
-
Parameters
|
|
1888
|
-
----------
|
|
1889
|
-
by
|
|
1890
|
-
Substring to split by.
|
|
1891
|
-
n
|
|
1892
|
-
Max number of items to return.
|
|
1893
|
-
|
|
1894
|
-
Returns
|
|
1895
|
-
-------
|
|
1896
|
-
Expr
|
|
1897
|
-
Expression of data type :class:`Struct` with fields of data type
|
|
1898
|
-
:class:`String`.
|
|
1899
|
-
|
|
1900
|
-
Examples
|
|
1901
|
-
--------
|
|
1902
|
-
>>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
|
|
1903
|
-
>>> df.with_columns(pl.col("s").str.splitn(" ", 2).alias("fields"))
|
|
1904
|
-
shape: (4, 2)
|
|
1905
|
-
┌─────────────┬───────────────────┐
|
|
1906
|
-
│ s ┆ fields │
|
|
1907
|
-
│ --- ┆ --- │
|
|
1908
|
-
│ str ┆ struct[2] │
|
|
1909
|
-
╞═════════════╪═══════════════════╡
|
|
1910
|
-
│ foo bar ┆ {"foo","bar"} │
|
|
1911
|
-
│ null ┆ {null,null} │
|
|
1912
|
-
│ foo-bar ┆ {"foo-bar",null} │
|
|
1913
|
-
│ foo bar baz ┆ {"foo","bar baz"} │
|
|
1914
|
-
└─────────────┴───────────────────┘
|
|
1915
|
-
|
|
1916
|
-
Split string values in column s in exactly 2 parts and assign
|
|
1917
|
-
each part to a new column.
|
|
1918
|
-
|
|
1919
|
-
>>> df.with_columns(
|
|
1920
|
-
... pl.col("s")
|
|
1921
|
-
... .str.splitn(" ", 2)
|
|
1922
|
-
... .struct.rename_fields(["first_part", "second_part"])
|
|
1923
|
-
... .alias("fields")
|
|
1924
|
-
... ).unnest("fields")
|
|
1925
|
-
shape: (4, 3)
|
|
1926
|
-
┌─────────────┬────────────┬─────────────┐
|
|
1927
|
-
│ s ┆ first_part ┆ second_part │
|
|
1928
|
-
│ --- ┆ --- ┆ --- │
|
|
1929
|
-
│ str ┆ str ┆ str │
|
|
1930
|
-
╞═════════════╪════════════╪═════════════╡
|
|
1931
|
-
│ foo bar ┆ foo ┆ bar │
|
|
1932
|
-
│ null ┆ null ┆ null │
|
|
1933
|
-
│ foo-bar ┆ foo-bar ┆ null │
|
|
1934
|
-
│ foo bar baz ┆ foo ┆ bar baz │
|
|
1935
|
-
└─────────────┴────────────┴─────────────┘
|
|
1936
|
-
"""
|
|
1937
|
-
by_pyexpr = parse_into_expression(by, str_as_lit=True)
|
|
1938
|
-
return wrap_expr(self._pyexpr.str_splitn(by_pyexpr, n))
|
|
1939
|
-
|
|
1940
|
-
def replace(
|
|
1941
|
-
self,
|
|
1942
|
-
pattern: str | Expr,
|
|
1943
|
-
value: str | Expr,
|
|
1944
|
-
*,
|
|
1945
|
-
literal: bool = False,
|
|
1946
|
-
n: int = 1,
|
|
1947
|
-
) -> Expr:
|
|
1948
|
-
r"""
|
|
1949
|
-
Replace first matching regex/literal substring with a new string value.
|
|
1950
|
-
|
|
1951
|
-
Parameters
|
|
1952
|
-
----------
|
|
1953
|
-
pattern
|
|
1954
|
-
A valid regular expression pattern, compatible with the `regex crate
|
|
1955
|
-
<https://docs.rs/regex/latest/regex/>`_.
|
|
1956
|
-
value
|
|
1957
|
-
String that will replace the matched substring.
|
|
1958
|
-
literal
|
|
1959
|
-
Treat `pattern` as a literal string.
|
|
1960
|
-
n
|
|
1961
|
-
Number of matches to replace.
|
|
1962
|
-
|
|
1963
|
-
See Also
|
|
1964
|
-
--------
|
|
1965
|
-
replace_all
|
|
1966
|
-
|
|
1967
|
-
Notes
|
|
1968
|
-
-----
|
|
1969
|
-
* To modify regular expression behaviour (such as case-sensitivity) with flags,
|
|
1970
|
-
use the inline `(?iLmsuxU)` syntax. See the regex crate's section on
|
|
1971
|
-
`grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_
|
|
1972
|
-
for additional information about the use of inline expression modifiers.
|
|
1973
|
-
|
|
1974
|
-
* The dollar sign (`$`) is a special character related to capture groups; if you
|
|
1975
|
-
want to replace some target pattern with characters that include a literal `$`
|
|
1976
|
-
you should escape it by doubling it up as `$$`, or set `literal=True` if you
|
|
1977
|
-
do not need a full regular expression pattern match. Otherwise, you will be
|
|
1978
|
-
referencing a (potentially non-existent) capture group.
|
|
1979
|
-
|
|
1980
|
-
In the example below we need to double up `$` (to represent a literal dollar
|
|
1981
|
-
sign, and then refer to the capture group using `$n` or `${n}`, hence the
|
|
1982
|
-
three consecutive `$` characters in the replacement value:
|
|
1983
|
-
|
|
1984
|
-
.. code-block:: python
|
|
1985
|
-
|
|
1986
|
-
>>> df = pl.DataFrame({"cost": ["#12.34", "#56.78"]})
|
|
1987
|
-
>>> df.with_columns(
|
|
1988
|
-
... cost_usd=pl.col("cost").str.replace(r"#(\d+)", "$$${1}")
|
|
1989
|
-
... )
|
|
1990
|
-
shape: (2, 2)
|
|
1991
|
-
┌────────┬──────────┐
|
|
1992
|
-
│ cost ┆ cost_usd │
|
|
1993
|
-
│ --- ┆ --- │
|
|
1994
|
-
│ str ┆ str │
|
|
1995
|
-
╞════════╪══════════╡
|
|
1996
|
-
│ #12.34 ┆ $12.34 │
|
|
1997
|
-
│ #56.78 ┆ $56.78 │
|
|
1998
|
-
└────────┴──────────┘
|
|
1999
|
-
|
|
2000
|
-
Examples
|
|
2001
|
-
--------
|
|
2002
|
-
>>> df = pl.DataFrame({"id": [1, 2], "text": ["123abc", "abc456"]})
|
|
2003
|
-
>>> df.with_columns(pl.col("text").str.replace(r"abc\b", "ABC"))
|
|
2004
|
-
shape: (2, 2)
|
|
2005
|
-
┌─────┬────────┐
|
|
2006
|
-
│ id ┆ text │
|
|
2007
|
-
│ --- ┆ --- │
|
|
2008
|
-
│ i64 ┆ str │
|
|
2009
|
-
╞═════╪════════╡
|
|
2010
|
-
│ 1 ┆ 123ABC │
|
|
2011
|
-
│ 2 ┆ abc456 │
|
|
2012
|
-
└─────┴────────┘
|
|
2013
|
-
|
|
2014
|
-
Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer
|
|
2015
|
-
to the first capture group in the `pattern`, `$2` or `${2}` to refer to the
|
|
2016
|
-
second capture group, and so on. You can also use *named* capture groups.
|
|
2017
|
-
|
|
2018
|
-
>>> df = pl.DataFrame({"word": ["hat", "hut"]})
|
|
2019
|
-
>>> df.with_columns(
|
|
2020
|
-
... positional=pl.col.word.str.replace("h(.)t", "b${1}d"),
|
|
2021
|
-
... named=pl.col.word.str.replace("h(?<vowel>.)t", "b${vowel}d"),
|
|
2022
|
-
... )
|
|
2023
|
-
shape: (2, 3)
|
|
2024
|
-
┌──────┬────────────┬───────┐
|
|
2025
|
-
│ word ┆ positional ┆ named │
|
|
2026
|
-
│ --- ┆ --- ┆ --- │
|
|
2027
|
-
│ str ┆ str ┆ str │
|
|
2028
|
-
╞══════╪════════════╪═══════╡
|
|
2029
|
-
│ hat ┆ bad ┆ bad │
|
|
2030
|
-
│ hut ┆ bud ┆ bud │
|
|
2031
|
-
└──────┴────────────┴───────┘
|
|
2032
|
-
|
|
2033
|
-
Apply case-insensitive string replacement using the `(?i)` flag.
|
|
2034
|
-
|
|
2035
|
-
>>> df = pl.DataFrame(
|
|
2036
|
-
... {
|
|
2037
|
-
... "city": "Philadelphia",
|
|
2038
|
-
... "season": ["Spring", "Summer", "Autumn", "Winter"],
|
|
2039
|
-
... "weather": ["Rainy", "Sunny", "Cloudy", "Snowy"],
|
|
2040
|
-
... }
|
|
2041
|
-
... )
|
|
2042
|
-
>>> df.with_columns(
|
|
2043
|
-
... pl.col("weather").str.replace(r"(?i)foggy|rainy|cloudy|snowy", "Sunny")
|
|
2044
|
-
... )
|
|
2045
|
-
shape: (4, 3)
|
|
2046
|
-
┌──────────────┬────────┬─────────┐
|
|
2047
|
-
│ city ┆ season ┆ weather │
|
|
2048
|
-
│ --- ┆ --- ┆ --- │
|
|
2049
|
-
│ str ┆ str ┆ str │
|
|
2050
|
-
╞══════════════╪════════╪═════════╡
|
|
2051
|
-
│ Philadelphia ┆ Spring ┆ Sunny │
|
|
2052
|
-
│ Philadelphia ┆ Summer ┆ Sunny │
|
|
2053
|
-
│ Philadelphia ┆ Autumn ┆ Sunny │
|
|
2054
|
-
│ Philadelphia ┆ Winter ┆ Sunny │
|
|
2055
|
-
└──────────────┴────────┴─────────┘
|
|
2056
|
-
"""
|
|
2057
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
2058
|
-
value_pyexpr = parse_into_expression(value, str_as_lit=True)
|
|
2059
|
-
return wrap_expr(
|
|
2060
|
-
self._pyexpr.str_replace_n(pattern_pyexpr, value_pyexpr, literal, n)
|
|
2061
|
-
)
|
|
2062
|
-
|
|
2063
|
-
def replace_all(
|
|
2064
|
-
self, pattern: str | Expr, value: str | Expr, *, literal: bool = False
|
|
2065
|
-
) -> Expr:
|
|
2066
|
-
r"""
|
|
2067
|
-
Replace all matching regex/literal substrings with a new string value.
|
|
2068
|
-
|
|
2069
|
-
Parameters
|
|
2070
|
-
----------
|
|
2071
|
-
pattern
|
|
2072
|
-
A valid regular expression pattern, compatible with the `regex crate
|
|
2073
|
-
<https://docs.rs/regex/latest/regex/>`_.
|
|
2074
|
-
value
|
|
2075
|
-
String that will replace the matched substring.
|
|
2076
|
-
literal
|
|
2077
|
-
Treat `pattern` as a literal string.
|
|
2078
|
-
|
|
2079
|
-
See Also
|
|
2080
|
-
--------
|
|
2081
|
-
replace
|
|
2082
|
-
|
|
2083
|
-
Notes
|
|
2084
|
-
-----
|
|
2085
|
-
* To modify regular expression behaviour (such as case-sensitivity) with flags,
|
|
2086
|
-
use the inline `(?iLmsuxU)` syntax. See the regex crate's section on
|
|
2087
|
-
`grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_
|
|
2088
|
-
for additional information about the use of inline expression modifiers.
|
|
2089
|
-
|
|
2090
|
-
* The dollar sign (`$`) is a special character related to capture groups; if you
|
|
2091
|
-
want to replace some target pattern with characters that include a literal `$`
|
|
2092
|
-
you should escape it by doubling it up as `$$`, or set `literal=True` if you
|
|
2093
|
-
do not need a full regular expression pattern match. Otherwise, you will be
|
|
2094
|
-
referencing a (potentially non-existent) capture group.
|
|
2095
|
-
|
|
2096
|
-
In the example below we need to double up `$` to represent a literal dollar
|
|
2097
|
-
sign, otherwise we are referring to a capture group (which may or may not
|
|
2098
|
-
exist):
|
|
2099
|
-
|
|
2100
|
-
.. code-block:: python
|
|
2101
|
-
|
|
2102
|
-
>>> df = pl.DataFrame({"text": ["ab12cd34ef", "gh45ij67kl"]})
|
|
2103
|
-
>>> df.with_columns(
|
|
2104
|
-
... # the replacement pattern refers back to the capture group
|
|
2105
|
-
... text1=pl.col("text").str.replace_all(r"(?<N>\d{2,})", "$N$"),
|
|
2106
|
-
... # doubling-up the `$` results in it appearing as a literal value
|
|
2107
|
-
... text2=pl.col("text").str.replace_all(r"(?<N>\d{2,})", "$$N$$"),
|
|
2108
|
-
... )
|
|
2109
|
-
shape: (2, 3)
|
|
2110
|
-
┌────────────┬──────────────┬──────────────┐
|
|
2111
|
-
│ text ┆ text1 ┆ text2 │
|
|
2112
|
-
│ --- ┆ --- ┆ --- │
|
|
2113
|
-
│ str ┆ str ┆ str │
|
|
2114
|
-
╞════════════╪══════════════╪══════════════╡
|
|
2115
|
-
│ ab12cd34ef ┆ ab12$cd34$ef ┆ ab$N$cd$N$ef │
|
|
2116
|
-
│ gh45ij67kl ┆ gh45$ij67$kl ┆ gh$N$ij$N$kl │
|
|
2117
|
-
└────────────┴──────────────┴──────────────┘
|
|
2118
|
-
|
|
2119
|
-
Examples
|
|
2120
|
-
--------
|
|
2121
|
-
>>> df = pl.DataFrame({"id": [1, 2], "text": ["abcabc", "123a123"]})
|
|
2122
|
-
>>> df.with_columns(pl.col("text").str.replace_all("a", "-"))
|
|
2123
|
-
shape: (2, 2)
|
|
2124
|
-
┌─────┬─────────┐
|
|
2125
|
-
│ id ┆ text │
|
|
2126
|
-
│ --- ┆ --- │
|
|
2127
|
-
│ i64 ┆ str │
|
|
2128
|
-
╞═════╪═════════╡
|
|
2129
|
-
│ 1 ┆ -bc-bc │
|
|
2130
|
-
│ 2 ┆ 123-123 │
|
|
2131
|
-
└─────┴─────────┘
|
|
2132
|
-
|
|
2133
|
-
Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer
|
|
2134
|
-
to the first capture group in the `pattern`, `$2` or `${2}` to refer to the
|
|
2135
|
-
second capture group, and so on. You can also use *named* capture groups.
|
|
2136
|
-
|
|
2137
|
-
>>> df = pl.DataFrame({"word": ["hat", "hut"]})
|
|
2138
|
-
>>> df.with_columns(
|
|
2139
|
-
... positional=pl.col.word.str.replace_all("h(.)t", "b${1}d"),
|
|
2140
|
-
... named=pl.col.word.str.replace_all("h(?<vowel>.)t", "b${vowel}d"),
|
|
2141
|
-
... )
|
|
2142
|
-
shape: (2, 3)
|
|
2143
|
-
┌──────┬────────────┬───────┐
|
|
2144
|
-
│ word ┆ positional ┆ named │
|
|
2145
|
-
│ --- ┆ --- ┆ --- │
|
|
2146
|
-
│ str ┆ str ┆ str │
|
|
2147
|
-
╞══════╪════════════╪═══════╡
|
|
2148
|
-
│ hat ┆ bad ┆ bad │
|
|
2149
|
-
│ hut ┆ bud ┆ bud │
|
|
2150
|
-
└──────┴────────────┴───────┘
|
|
2151
|
-
|
|
2152
|
-
Apply case-insensitive string replacement using the `(?i)` flag.
|
|
2153
|
-
|
|
2154
|
-
>>> df = pl.DataFrame(
|
|
2155
|
-
... {
|
|
2156
|
-
... "city": "Philadelphia",
|
|
2157
|
-
... "season": ["Spring", "Summer", "Autumn", "Winter"],
|
|
2158
|
-
... "weather": ["Rainy", "Sunny", "Cloudy", "Snowy"],
|
|
2159
|
-
... }
|
|
2160
|
-
... )
|
|
2161
|
-
>>> df.with_columns(
|
|
2162
|
-
... # apply case-insensitive string replacement
|
|
2163
|
-
... pl.col("weather").str.replace_all(
|
|
2164
|
-
... r"(?i)foggy|rainy|cloudy|snowy", "Sunny"
|
|
2165
|
-
... )
|
|
2166
|
-
... )
|
|
2167
|
-
shape: (4, 3)
|
|
2168
|
-
┌──────────────┬────────┬─────────┐
|
|
2169
|
-
│ city ┆ season ┆ weather │
|
|
2170
|
-
│ --- ┆ --- ┆ --- │
|
|
2171
|
-
│ str ┆ str ┆ str │
|
|
2172
|
-
╞══════════════╪════════╪═════════╡
|
|
2173
|
-
│ Philadelphia ┆ Spring ┆ Sunny │
|
|
2174
|
-
│ Philadelphia ┆ Summer ┆ Sunny │
|
|
2175
|
-
│ Philadelphia ┆ Autumn ┆ Sunny │
|
|
2176
|
-
│ Philadelphia ┆ Winter ┆ Sunny │
|
|
2177
|
-
└──────────────┴────────┴─────────┘
|
|
2178
|
-
"""
|
|
2179
|
-
pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
|
|
2180
|
-
value_pyexpr = parse_into_expression(value, str_as_lit=True)
|
|
2181
|
-
return wrap_expr(
|
|
2182
|
-
self._pyexpr.str_replace_all(pattern_pyexpr, value_pyexpr, literal)
|
|
2183
|
-
)
|
|
2184
|
-
|
|
2185
|
-
def reverse(self) -> Expr:
|
|
2186
|
-
"""
|
|
2187
|
-
Returns string values in reversed order.
|
|
2188
|
-
|
|
2189
|
-
Examples
|
|
2190
|
-
--------
|
|
2191
|
-
>>> df = pl.DataFrame({"text": ["foo", "bar", "man\u0303ana"]})
|
|
2192
|
-
>>> df.with_columns(pl.col("text").str.reverse().alias("reversed"))
|
|
2193
|
-
shape: (3, 2)
|
|
2194
|
-
┌────────┬──────────┐
|
|
2195
|
-
│ text ┆ reversed │
|
|
2196
|
-
│ --- ┆ --- │
|
|
2197
|
-
│ str ┆ str │
|
|
2198
|
-
╞════════╪══════════╡
|
|
2199
|
-
│ foo ┆ oof │
|
|
2200
|
-
│ bar ┆ rab │
|
|
2201
|
-
│ mañana ┆ anañam │
|
|
2202
|
-
└────────┴──────────┘
|
|
2203
|
-
"""
|
|
2204
|
-
return wrap_expr(self._pyexpr.str_reverse())
|
|
2205
|
-
|
|
2206
|
-
def slice(
|
|
2207
|
-
self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
|
|
2208
|
-
) -> Expr:
|
|
2209
|
-
"""
|
|
2210
|
-
Extract a substring from each string value.
|
|
2211
|
-
|
|
2212
|
-
Parameters
|
|
2213
|
-
----------
|
|
2214
|
-
offset
|
|
2215
|
-
Start index. Negative indexing is supported.
|
|
2216
|
-
length
|
|
2217
|
-
Length of the slice. If set to `None` (default), the slice is taken to the
|
|
2218
|
-
end of the string.
|
|
2219
|
-
|
|
2220
|
-
Returns
|
|
2221
|
-
-------
|
|
2222
|
-
Expr
|
|
2223
|
-
Expression of data type :class:`String`.
|
|
2224
|
-
|
|
2225
|
-
Notes
|
|
2226
|
-
-----
|
|
2227
|
-
Both the `offset` and `length` inputs are defined in terms of the number
|
|
2228
|
-
of characters in the (UTF8) string. A character is defined as a
|
|
2229
|
-
`Unicode scalar value`_. A single character is represented by a single byte
|
|
2230
|
-
when working with ASCII text, and a maximum of 4 bytes otherwise.
|
|
2231
|
-
|
|
2232
|
-
.. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
|
|
2233
|
-
|
|
2234
|
-
Examples
|
|
2235
|
-
--------
|
|
2236
|
-
>>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
|
|
2237
|
-
>>> df.with_columns(pl.col("s").str.slice(-3).alias("slice"))
|
|
2238
|
-
shape: (4, 2)
|
|
2239
|
-
┌─────────────┬───────┐
|
|
2240
|
-
│ s ┆ slice │
|
|
2241
|
-
│ --- ┆ --- │
|
|
2242
|
-
│ str ┆ str │
|
|
2243
|
-
╞═════════════╪═══════╡
|
|
2244
|
-
│ pear ┆ ear │
|
|
2245
|
-
│ null ┆ null │
|
|
2246
|
-
│ papaya ┆ aya │
|
|
2247
|
-
│ dragonfruit ┆ uit │
|
|
2248
|
-
└─────────────┴───────┘
|
|
2249
|
-
|
|
2250
|
-
Using the optional `length` parameter
|
|
2251
|
-
|
|
2252
|
-
>>> df.with_columns(pl.col("s").str.slice(4, length=3).alias("slice"))
|
|
2253
|
-
shape: (4, 2)
|
|
2254
|
-
┌─────────────┬───────┐
|
|
2255
|
-
│ s ┆ slice │
|
|
2256
|
-
│ --- ┆ --- │
|
|
2257
|
-
│ str ┆ str │
|
|
2258
|
-
╞═════════════╪═══════╡
|
|
2259
|
-
│ pear ┆ │
|
|
2260
|
-
│ null ┆ null │
|
|
2261
|
-
│ papaya ┆ ya │
|
|
2262
|
-
│ dragonfruit ┆ onf │
|
|
2263
|
-
└─────────────┴───────┘
|
|
2264
|
-
"""
|
|
2265
|
-
offset_pyexpr = parse_into_expression(offset)
|
|
2266
|
-
length_pyexpr = parse_into_expression(length)
|
|
2267
|
-
return wrap_expr(self._pyexpr.str_slice(offset_pyexpr, length_pyexpr))
|
|
2268
|
-
|
|
2269
|
-
def head(self, n: int | IntoExprColumn) -> Expr:
    """
    Return the first `n` characters of each string in a String Series.

    Parameters
    ----------
    n
        Number of characters to keep (integer or expression). Negative
        values are supported: for example, ``n = -3`` keeps all characters
        except the last three.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Notes
    -----
    `n` counts characters (`Unicode scalar values`_), not bytes; a single
    character occupies 1 byte in ASCII text and up to 4 bytes otherwise.
    Strings with fewer than `n` characters are returned in full.

    .. _Unicode scalar values: https://www.unicode.org/glossary/#unicode_scalar_value

    Examples
    --------
    >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
    >>> df.with_columns(pl.col("s").str.head(5).alias("s_head_5"))
    shape: (4, 2)
    ┌─────────────┬──────────┐
    │ s           ┆ s_head_5 │
    │ ---         ┆ ---      │
    │ str         ┆ str      │
    ╞═════════════╪══════════╡
    │ pear        ┆ pear     │
    │ null        ┆ null     │
    │ papaya      ┆ papay    │
    │ dragonfruit ┆ drago    │
    └─────────────┴──────────┘
    """
    # `n` may be a plain int or a column expression; normalize it first.
    length_expr = parse_into_expression(n)
    return wrap_expr(self._pyexpr.str_head(length_expr))
|
|
2341
|
-
|
|
2342
|
-
def tail(self, n: int | IntoExprColumn) -> Expr:
    """
    Return the last `n` characters of each string in a String Series.

    Parameters
    ----------
    n
        Number of characters to keep (integer or expression). Negative
        values are supported: for example, ``n = -3`` drops the first three
        characters and keeps the rest.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Notes
    -----
    `n` counts characters (`Unicode scalar values`_), not bytes; a single
    character occupies 1 byte in ASCII text and up to 4 bytes otherwise.
    Strings with fewer than `n` characters are returned in full.

    .. _Unicode scalar values: https://www.unicode.org/glossary/#unicode_scalar_value

    Examples
    --------
    >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
    >>> df.with_columns(pl.col("s").str.tail(5).alias("s_tail_5"))
    shape: (4, 2)
    ┌─────────────┬──────────┐
    │ s           ┆ s_tail_5 │
    │ ---         ┆ ---      │
    │ str         ┆ str      │
    ╞═════════════╪══════════╡
    │ pear        ┆ pear     │
    │ null        ┆ null     │
    │ papaya      ┆ apaya    │
    │ dragonfruit ┆ fruit    │
    └─────────────┴──────────┘
    """
    # `n` may be a plain int or a column expression; normalize it first.
    length_expr = parse_into_expression(n)
    return wrap_expr(self._pyexpr.str_tail(length_expr))
|
|
2414
|
-
|
|
2415
|
-
@deprecated(
    '`str.explode` is deprecated; use `str.split("").explode()` instead.'
    " Note that empty strings will result in null instead of being preserved."
    " To get the exact same behavior, split first and then use a `pl.when...then...otherwise`"
    " expression to handle the empty list before exploding."
)
def explode(self) -> Expr:
    """
    Returns a column with a separate row for every string character.

    .. deprecated:: 0.20.31
        Use the `.str.split("").explode()` method instead. Note that empty
        strings will result in null instead of being preserved. To get the
        exact same behavior, split first and then use a
        `pl.when...then...otherwise` expression to handle the empty list
        before exploding.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["foo", "bar"]})
    >>> df.select(pl.col("a").str.explode())  # doctest: +SKIP
    shape: (6, 1)
    ┌─────┐
    │ a   │
    │ --- │
    │ str │
    ╞═════╡
    │ f   │
    │ o   │
    │ o   │
    │ b   │
    │ a   │
    │ r   │
    └─────┘
    """
    # Splitting "" yields an empty list for empty strings, which would
    # explode to null; substitute [""] to preserve the historical behavior
    # of keeping empty strings as-is.
    chars = self.split("")
    has_chars = chars.ne_missing([])
    return F.when(has_chars).then(chars).otherwise([""]).explode()
|
|
2456
|
-
|
|
2457
|
-
def to_integer(
    self,
    *,
    base: int | IntoExprColumn = 10,
    dtype: PolarsIntegerType = Int64,
    strict: bool = True,
) -> Expr:
    """
    Convert a String column into an integer column with the given base radix.

    Parameters
    ----------
    base
        Positive integer or expression which is the base of the string
        we are parsing.
        Default: 10.
    dtype
        Integer data type of the resulting column.
        Default: :class:`Int64`.
    strict
        Bool, Default=True will raise any ParseError or overflow as ComputeError.
        False silently convert to Null.

    Returns
    -------
    Expr
        Expression of the integer data type given by `dtype`
        (:class:`Int64` by default).

    Examples
    --------
    >>> df = pl.DataFrame({"bin": ["110", "101", "010", "invalid"]})
    >>> df.with_columns(
    ...     parsed=pl.col("bin").str.to_integer(
    ...         base=2, dtype=pl.Int32, strict=False
    ...     )
    ... )
    shape: (4, 2)
    ┌─────────┬────────┐
    │ bin     ┆ parsed │
    │ ---     ┆ ---    │
    │ str     ┆ i32    │
    ╞═════════╪════════╡
    │ 110     ┆ 6      │
    │ 101     ┆ 5      │
    │ 010     ┆ 2      │
    │ invalid ┆ null   │
    └─────────┴────────┘

    >>> df = pl.DataFrame({"hex": ["fa1e", "ff00", "cafe", None]})
    >>> df.with_columns(parsed=pl.col("hex").str.to_integer(base=16, strict=True))
    shape: (4, 2)
    ┌──────┬────────┐
    │ hex  ┆ parsed │
    │ ---  ┆ ---    │
    │ str  ┆ i64    │
    ╞══════╪════════╡
    │ fa1e ┆ 64030  │
    │ ff00 ┆ 65280  │
    │ cafe ┆ 51966  │
    │ null ┆ null   │
    └──────┴────────┘
    """
    # `base` may be a column name string; treat strings as column
    # references, not literals.
    base_pyexpr = parse_into_expression(base, str_as_lit=False)
    return wrap_expr(self._pyexpr.str_to_integer(base_pyexpr, dtype, strict))
|
|
2518
|
-
|
|
2519
|
-
def contains_any(
    self, patterns: IntoExpr, *, ascii_case_insensitive: bool = False
) -> Expr:
    """
    Use the Aho-Corasick algorithm to find matches.

    Determines if any of the patterns are contained in the string.

    Parameters
    ----------
    patterns
        String patterns to search.
    ascii_case_insensitive
        Enable ASCII-aware case-insensitive matching.
        When this option is enabled, searching will be performed without respect
        to case for ASCII letters (a-z and A-Z) only.

    Notes
    -----
    This method supports matching on string literals only, and does not support
    regular expression matching.

    Examples
    --------
    >>> _ = pl.Config.set_fmt_str_lengths(100)
    >>> df = pl.DataFrame(
    ...     {
    ...         "lyrics": [
    ...             "Everybody wants to rule the world",
    ...             "Tell me what you want, what you really really want",
    ...             "Can you feel the love tonight",
    ...         ]
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
    ... )
    shape: (3, 2)
    ┌────────────────────────────────────────────────────┬──────────────┐
    │ lyrics                                             ┆ contains_any │
    │ ---                                                ┆ ---          │
    │ str                                                ┆ bool         │
    ╞════════════════════════════════════════════════════╪══════════════╡
    │ Everybody wants to rule the world                  ┆ false        │
    │ Tell me what you want, what you really really want ┆ true         │
    │ Can you feel the love tonight                      ┆ true         │
    └────────────────────────────────────────────────────┴──────────────┘
    """
    # Strings in `patterns` name columns rather than literal values.
    pat_expr = parse_into_expression(patterns, str_as_lit=False)
    return wrap_expr(self._pyexpr.str_contains_any(pat_expr, ascii_case_insensitive))
|
|
2571
|
-
|
|
2572
|
-
def replace_many(
    self,
    patterns: IntoExpr | Mapping[str, str],
    replace_with: IntoExpr | NoDefault = no_default,
    *,
    ascii_case_insensitive: bool = False,
) -> Expr:
    """
    Use the Aho-Corasick algorithm to replace many matches.

    Parameters
    ----------
    patterns
        String patterns to search and replace.
        Accepts expression input. Strings are parsed as column names, and other
        non-expression inputs are parsed as literals. Also accepts a mapping of
        patterns to their replacement as syntactic sugar for
        `replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`.
    replace_with
        Strings to replace where a pattern was a match.
        Accepts expression input. Non-expression inputs are parsed as literals.
        Length must match the length of `patterns` or have length 1. This can be
        broadcasted, so it supports many:one and many:many.
    ascii_case_insensitive
        Enable ASCII-aware case-insensitive matching.
        When this option is enabled, searching will be performed without respect
        to case for ASCII letters (a-z and A-Z) only.

    Notes
    -----
    This method supports matching on string literals only, and does not support
    regular expression matching.

    Examples
    --------
    Replace many patterns by passing sequences of equal length to the
    `patterns` and `replace_with` parameters.

    >>> df = pl.DataFrame({"lyrics": ["Can you feel the love tonight"]})
    >>> df.with_columns(
    ...     pl.col("lyrics").str.replace_many(["me", "you"], ["you", "me"]).alias("confusing")
    ... )  # doctest: +SKIP

    Broadcast one replacement for many patterns by passing a sequence of
    length 1 to `replace_with`, or pass a mapping of patterns to their
    replacements as syntactic sugar:

    >>> mapping = {"me": "you", "you": "me", "want": "need"}
    >>> df.with_columns(
    ...     pl.col("lyrics").str.replace_many(mapping).alias("confusing")
    ... )  # doctest: +SKIP
    """
    if replace_with is no_default:
        # With no explicit replacements, `patterns` must be a mapping of
        # pattern -> replacement.
        if not isinstance(patterns, Mapping):
            msg = "`replace_with` argument is required if `patterns` argument is not a Mapping type"
            raise TypeError(msg)
        # Early return in case of an empty mapping.
        if not patterns:
            return wrap_expr(self._pyexpr)
        # Unzip the mapping into parallel pattern/replacement lists.
        patterns, replace_with = list(patterns.keys()), list(patterns.values())

    pat_expr = parse_into_expression(
        patterns,  # type: ignore[arg-type]
        str_as_lit=False,
    )
    rep_expr = parse_into_expression(replace_with, str_as_lit=True)
    return wrap_expr(
        self._pyexpr.str_replace_many(pat_expr, rep_expr, ascii_case_insensitive)
    )
|
|
2721
|
-
|
|
2722
|
-
@unstable()
def extract_many(
    self,
    patterns: IntoExpr,
    *,
    ascii_case_insensitive: bool = False,
    overlapping: bool = False,
) -> Expr:
    """
    Use the Aho-Corasick algorithm to extract many matches.

    Parameters
    ----------
    patterns
        String patterns to search.
    ascii_case_insensitive
        Enable ASCII-aware case-insensitive matching.
        When this option is enabled, searching will be performed without respect
        to case for ASCII letters (a-z and A-Z) only.
    overlapping
        Whether matches may overlap.

    Notes
    -----
    This method supports matching on string literals only, and does not support
    regular expression matching.

    Examples
    --------
    >>> _ = pl.Config.set_fmt_str_lengths(100)
    >>> df = pl.DataFrame({"values": ["discontent"]})
    >>> patterns = ["winter", "disco", "onte", "discontent"]
    >>> df.with_columns(
    ...     pl.col("values")
    ...     .str.extract_many(patterns, overlapping=False)
    ...     .alias("matches"),
    ...     pl.col("values")
    ...     .str.extract_many(patterns, overlapping=True)
    ...     .alias("matches_overlapping"),
    ... )
    shape: (1, 3)
    ┌────────────┬───────────┬─────────────────────────────────┐
    │ values     ┆ matches   ┆ matches_overlapping             │
    │ ---        ┆ ---       ┆ ---                             │
    │ str        ┆ list[str] ┆ list[str]                       │
    ╞════════════╪═══════════╪═════════════════════════════════╡
    │ discontent ┆ ["disco"] ┆ ["disco", "onte", "discontent"] │
    └────────────┴───────────┴─────────────────────────────────┘
    >>> df = pl.DataFrame(
    ...     {
    ...         "values": ["discontent", "rhapsody"],
    ...         "patterns": [
    ...             ["winter", "disco", "onte", "discontent"],
    ...             ["rhap", "ody", "coalesce"],
    ...         ],
    ...     }
    ... )
    >>> df.select(pl.col("values").str.extract_many("patterns"))
    shape: (2, 1)
    ┌─────────────────┐
    │ values          │
    │ ---             │
    │ list[str]       │
    ╞═════════════════╡
    │ ["disco"]       │
    │ ["rhap", "ody"] │
    └─────────────────┘
    """
    # Strings in `patterns` name columns rather than literal values.
    pat_expr = parse_into_expression(patterns, str_as_lit=False)
    return wrap_expr(
        self._pyexpr.str_extract_many(pat_expr, ascii_case_insensitive, overlapping)
    )
|
|
2796
|
-
|
|
2797
|
-
@unstable()
def find_many(
    self,
    patterns: IntoExpr,
    *,
    ascii_case_insensitive: bool = False,
    overlapping: bool = False,
) -> Expr:
    """
    Use the Aho-Corasick algorithm to find many matches.

    The function will return the bytes offset of the start of each match.
    The return type will be `List<UInt32>`

    Parameters
    ----------
    patterns
        String patterns to search.
    ascii_case_insensitive
        Enable ASCII-aware case-insensitive matching.
        When this option is enabled, searching will be performed without respect
        to case for ASCII letters (a-z and A-Z) only.
    overlapping
        Whether matches may overlap.

    Notes
    -----
    This method supports matching on string literals only, and does not support
    regular expression matching.

    Examples
    --------
    >>> _ = pl.Config.set_fmt_str_lengths(100)
    >>> df = pl.DataFrame({"values": ["discontent"]})
    >>> patterns = ["winter", "disco", "onte", "discontent"]
    >>> df.with_columns(
    ...     pl.col("values")
    ...     .str.find_many(patterns, overlapping=False)
    ...     .alias("matches"),
    ...     pl.col("values")
    ...     .str.find_many(patterns, overlapping=True)
    ...     .alias("matches_overlapping"),
    ... )
    shape: (1, 3)
    ┌────────────┬───────────┬─────────────────────┐
    │ values     ┆ matches   ┆ matches_overlapping │
    │ ---        ┆ ---       ┆ ---                 │
    │ str        ┆ list[u32] ┆ list[u32]           │
    ╞════════════╪═══════════╪═════════════════════╡
    │ discontent ┆ [0]       ┆ [0, 4, 0]           │
    └────────────┴───────────┴─────────────────────┘
    >>> df = pl.DataFrame(
    ...     {
    ...         "values": ["discontent", "rhapsody"],
    ...         "patterns": [
    ...             ["winter", "disco", "onte", "discontent"],
    ...             ["rhap", "ody", "coalesce"],
    ...         ],
    ...     }
    ... )
    >>> df.select(pl.col("values").str.find_many("patterns"))
    shape: (2, 1)
    ┌───────────┐
    │ values    │
    │ ---       │
    │ list[u32] │
    ╞═══════════╡
    │ [0]       │
    │ [0, 5]    │
    └───────────┘
    """
    # Strings in `patterns` name columns rather than literal values.
    pat_expr = parse_into_expression(patterns, str_as_lit=False)
    return wrap_expr(
        self._pyexpr.str_find_many(pat_expr, ascii_case_insensitive, overlapping)
    )
|
|
2874
|
-
|
|
2875
|
-
def join(self, delimiter: str = "", *, ignore_nulls: bool = True) -> Expr:
    """
    Vertically concatenate the string values in the column to a single string value.

    Parameters
    ----------
    delimiter
        The delimiter to insert between consecutive string values.
    ignore_nulls
        Ignore null values (default).
        If set to `False`, null values will be propagated. This means that
        if the column contains any null values, the output is null.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": [1, None, 3]})
    >>> df.select(pl.col("foo").str.join("-"))
    shape: (1, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ str │
    ╞═════╡
    │ 1-3 │
    └─────┘
    >>> df.select(pl.col("foo").str.join(ignore_nulls=False))
    shape: (1, 1)
    ┌──────┐
    │ foo  │
    │ ---  │
    │ str  │
    ╞══════╡
    │ null │
    └──────┘
    """
    joined = self._pyexpr.str_join(delimiter, ignore_nulls=ignore_nulls)
    return wrap_expr(joined)
|
|
2916
|
-
|
|
2917
|
-
@deprecated(
    "`str.concat` is deprecated; use `str.join` instead. Note also that the "
    "default `delimiter` for `str.join` is an empty string, not a hyphen."
)
def concat(
    self, delimiter: str | None = None, *, ignore_nulls: bool = True
) -> Expr:
    """
    Vertically concatenate the string values in the column to a single string value.

    .. deprecated:: 1.0.0
        Use :meth:`join` instead. Note that the default `delimiter` for :meth:`join`
        is an empty string instead of a hyphen.

    Parameters
    ----------
    delimiter
        The delimiter to insert between consecutive string values.
    ignore_nulls
        Ignore null values (default).
        If set to `False`, null values will be propagated. This means that
        if the column contains any null values, the output is null.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": [1, None, 2]})
    >>> df.select(pl.col("foo").str.concat("-"))  # doctest: +SKIP
    shape: (1, 1)
    ┌─────┐
    │ foo │
    │ --- │
    │ str │
    ╞═════╡
    │ 1-2 │
    └─────┘
    >>> df.select(
    ...     pl.col("foo").str.concat("-", ignore_nulls=False)
    ... )  # doctest: +SKIP
    shape: (1, 1)
    ┌──────┐
    │ foo  │
    │ ---  │
    │ str  │
    ╞══════╡
    │ null │
    └──────┘
    """
    # Historical default delimiter was a hyphen, unlike `join`.
    sep = "-" if delimiter is None else delimiter
    return self.join(sep, ignore_nulls=ignore_nulls)
|
|
2972
|
-
|
|
2973
|
-
def escape_regex(self) -> Expr:
    r"""
    Returns string values with all regular expression meta characters escaped.

    Examples
    --------
    >>> df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    >>> df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))
    shape: (4, 2)
    ┌──────────┬──────────────┐
    │ text     ┆ escaped      │
    │ ---      ┆ ---          │
    │ str      ┆ str          │
    ╞══════════╪══════════════╡
    │ abc      ┆ abc          │
    │ def      ┆ def          │
    │ null     ┆ null         │
    │ abc(\w+) ┆ abc\(\\w\+\) │
    └──────────┴──────────────┘
    """
    escaped = self._pyexpr.str_escape_regex()
    return wrap_expr(escaped)
|
|
2994
|
-
|
|
2995
|
-
def normalize(self, form: UnicodeForm = "NFC") -> Expr:
    """
    Returns the Unicode normal form of the string values.

    This uses the forms described in Unicode Standard Annex 15: <https://www.unicode.org/reports/tr15/>.

    Parameters
    ----------
    form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
        Unicode form to use.

    Examples
    --------
    >>> df = pl.DataFrame({"text": ["01²", "KADOKAWA"]})
    >>> new = df.with_columns(
    ...     nfc=pl.col("text").str.normalize("NFC"),
    ...     nfkc=pl.col("text").str.normalize("NFKC"),
    ... )
    >>> new
    shape: (2, 3)
    ┌──────────────────┬──────────────────┬──────────┐
    │ text             ┆ nfc              ┆ nfkc     │
    │ ---              ┆ ---              ┆ ---      │
    │ str              ┆ str              ┆ str      │
    ╞══════════════════╪══════════════════╪══════════╡
    │ 01²              ┆ 01²              ┆ 012      │
    │ KADOKAWA ┆ KADOKAWA ┆ KADOKAWA │
    └──────────────────┴──────────────────┴──────────┘
    >>> new.select(pl.all().str.len_bytes())
    shape: (2, 3)
    ┌──────┬─────┬──────┐
    │ text ┆ nfc ┆ nfkc │
    │ ---  ┆ --- ┆ ---  │
    │ u32  ┆ u32 ┆ u32  │
    ╞══════╪═════╪══════╡
    │ 4    ┆ 4   ┆ 3    │
    │ 24   ┆ 24  ┆ 8    │
    └──────┴─────┴──────┘
    """  # noqa: RUF002
    normalized = self._pyexpr.str_normalize(form)
    return wrap_expr(normalized)
|
|
3035
|
-
|
|
3036
|
-
|
|
3037
|
-
def _validate_format_argument(format: str | None) -> None:
    # Warn about `.%f` in a chrono format string: it parses a literal dot
    # followed by fractional seconds, which is almost always a mistake for
    # the intended `%.f`.
    if format is None or ".%f" not in format:
        return
    message = (
        "Detected the pattern `.%f` in the chrono format string."
        " This pattern should not be used to parse values after a decimal point."
        " Use `%.f` instead."
        " See the full specification: https://docs.rs/chrono/latest/chrono/format/strftime"
    )
    warnings.warn(message, ChronoFormatWarning, stacklevel=find_stacklevel())