polars-runtime-compat 1.34.0b3__cp39-abi3-macosx_11_0_arm64.whl → 1.34.0b4__cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic.

Files changed (203):
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
  3. polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  202. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/expr/string.py DELETED
@@ -1,3045 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import warnings
4
- from collections.abc import Mapping
5
- from typing import TYPE_CHECKING
6
-
7
- import polars._reexport as pl
8
- from polars import functions as F
9
- from polars._utils.deprecation import deprecate_nonkeyword_arguments, deprecated
10
- from polars._utils.parse import parse_into_expression
11
- from polars._utils.unstable import unstable
12
- from polars._utils.various import (
13
- find_stacklevel,
14
- issue_warning,
15
- no_default,
16
- qualified_type_name,
17
- )
18
- from polars._utils.wrap import wrap_expr
19
- from polars.datatypes import Date, Datetime, Int64, Time, parse_into_datatype_expr
20
- from polars.exceptions import ChronoFormatWarning
21
-
22
- if TYPE_CHECKING:
23
- import sys
24
-
25
- from polars import Expr
26
- from polars._typing import (
27
- Ambiguous,
28
- IntoExpr,
29
- IntoExprColumn,
30
- PolarsDataType,
31
- PolarsIntegerType,
32
- PolarsTemporalType,
33
- TimeUnit,
34
- TransferEncoding,
35
- UnicodeForm,
36
- )
37
- from polars._utils.various import NoDefault
38
-
39
- if sys.version_info >= (3, 13):
40
- from warnings import deprecated
41
- else:
42
- from typing_extensions import deprecated # noqa: TC004
43
-
44
-
45
- class ExprStringNameSpace:
46
- """Namespace for string related expressions."""
47
-
48
- _accessor = "str"
49
-
50
- def __init__(self, expr: Expr) -> None:
51
- self._pyexpr = expr._pyexpr
52
-
53
- def to_date(
54
- self,
55
- format: str | None = None,
56
- *,
57
- strict: bool = True,
58
- exact: bool = True,
59
- cache: bool = True,
60
- ) -> Expr:
61
- """
62
- Convert a String column into a Date column.
63
-
64
- Parameters
65
- ----------
66
- format
67
- Format to use for conversion. Refer to the `chrono crate documentation
68
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
69
- for the full specification. Example: `"%Y-%m-%d"`.
70
- If set to None (default), the format is inferred from the data.
71
- strict
72
- Raise an error if any conversion fails.
73
- exact
74
- Require an exact format match. If False, allow the format to match anywhere
75
- in the target string.
76
-
77
- .. note::
78
- Using `exact=False` introduces a performance penalty - cleaning your
79
- data beforehand will almost certainly be more performant.
80
- cache
81
- Use a cache of unique, converted dates to apply the conversion.
82
-
83
- Examples
84
- --------
85
- >>> s = pl.Series(["2020/01/01", "2020/02/01", "2020/03/01"])
86
- >>> s.str.to_date()
87
- shape: (3,)
88
- Series: '' [date]
89
- [
90
- 2020-01-01
91
- 2020-02-01
92
- 2020-03-01
93
- ]
94
- """
95
- _validate_format_argument(format)
96
- return wrap_expr(self._pyexpr.str_to_date(format, strict, exact, cache))
97
-
98
- def to_datetime(
99
- self,
100
- format: str | None = None,
101
- *,
102
- time_unit: TimeUnit | None = None,
103
- time_zone: str | None = None,
104
- strict: bool = True,
105
- exact: bool = True,
106
- cache: bool = True,
107
- ambiguous: Ambiguous | Expr = "raise",
108
- ) -> Expr:
109
- """
110
- Convert a String column into a Datetime column.
111
-
112
- Parameters
113
- ----------
114
- format
115
- Format to use for conversion. Refer to the `chrono crate documentation
116
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
117
- for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
118
- If set to None (default), the format is inferred from the data.
119
- time_unit : {None, 'us', 'ns', 'ms'}
120
- Unit of time for the resulting Datetime column. If set to None (default),
121
- the time unit is inferred from the format string if given, eg:
122
- `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
123
- found, the default is `"us"`.
124
- time_zone
125
- Time zone for the resulting Datetime column. Rules are:
126
-
127
- - If inputs are tz-naive and `time_zone` is None, the result time zone is
128
- `None`.
129
- - If inputs are offset-aware and `time_zone` is None, inputs are converted
130
- to `'UTC'` and the result time zone is `'UTC'`.
131
- - If inputs are offset-aware and `time_zone` is given, inputs are converted
132
- to `time_zone` and the result time zone is `time_zone`.
133
- - If inputs are tz-naive and `time_zone` is given, input time zones are
134
- replaced with (not converted to!) `time_zone`, and the result time zone
135
- is `time_zone`.
136
- strict
137
- Raise an error if any conversion fails.
138
- exact
139
- Require an exact format match. If False, allow the format to match anywhere
140
- in the target string.
141
-
142
- .. note::
143
- Using `exact=False` introduces a performance penalty - cleaning your
144
- data beforehand will almost certainly be more performant.
145
- cache
146
- Use a cache of unique, converted datetimes to apply the conversion.
147
- ambiguous
148
- Determine how to deal with ambiguous datetimes:
149
-
150
- - `'raise'` (default): raise
151
- - `'earliest'`: use the earliest datetime
152
- - `'latest'`: use the latest datetime
153
- - `'null'`: set to null
154
-
155
- Examples
156
- --------
157
- >>> s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
158
- >>> s.str.to_datetime("%Y-%m-%d %H:%M%#z")
159
- shape: (2,)
160
- Series: '' [datetime[μs, UTC]]
161
- [
162
- 2020-01-01 01:00:00 UTC
163
- 2020-01-01 02:00:00 UTC
164
- ]
165
- """
166
- _validate_format_argument(format)
167
- if not isinstance(ambiguous, pl.Expr):
168
- ambiguous = F.lit(ambiguous)
169
- return wrap_expr(
170
- self._pyexpr.str_to_datetime(
171
- format,
172
- time_unit,
173
- time_zone,
174
- strict,
175
- exact,
176
- cache,
177
- ambiguous._pyexpr,
178
- )
179
- )
180
-
181
- def to_time(
182
- self,
183
- format: str | None = None,
184
- *,
185
- strict: bool = True,
186
- cache: bool = True,
187
- ) -> Expr:
188
- """
189
- Convert a String column into a Time column.
190
-
191
- Parameters
192
- ----------
193
- format
194
- Format to use for conversion. Refer to the `chrono crate documentation
195
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
196
- for the full specification. Example: `"%H:%M:%S"`.
197
- If set to None (default), the format is inferred from the data.
198
- strict
199
- Raise an error if any conversion fails.
200
- cache
201
- Use a cache of unique, converted times to apply the conversion.
202
-
203
- Examples
204
- --------
205
- >>> s = pl.Series(["01:00", "02:00", "03:00"])
206
- >>> s.str.to_time("%H:%M")
207
- shape: (3,)
208
- Series: '' [time]
209
- [
210
- 01:00:00
211
- 02:00:00
212
- 03:00:00
213
- ]
214
- """
215
- _validate_format_argument(format)
216
- return wrap_expr(self._pyexpr.str_to_time(format, strict, cache))
217
-
218
- def strptime(
219
- self,
220
- dtype: PolarsTemporalType,
221
- format: str | None = None,
222
- *,
223
- strict: bool = True,
224
- exact: bool = True,
225
- cache: bool = True,
226
- ambiguous: Ambiguous | Expr = "raise",
227
- ) -> Expr:
228
- """
229
- Convert a String column into a Date/Datetime/Time column.
230
-
231
- Parameters
232
- ----------
233
- dtype
234
- The data type to convert into. Can be either Date, Datetime, or Time.
235
- format
236
- Format to use for conversion. Refer to the `chrono crate documentation
237
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
238
- for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
239
- If set to None (default), the format is inferred from the data.
240
- strict
241
- Raise an error if any conversion fails.
242
- exact
243
- Require an exact format match. If False, allow the format to match anywhere
244
- in the target string. Conversion to the Time type is always exact.
245
-
246
- .. note::
247
- Using `exact=False` introduces a performance penalty - cleaning your
248
- data beforehand will almost certainly be more performant.
249
- cache
250
- Use a cache of unique, converted dates to apply the datetime conversion.
251
- ambiguous
252
- Determine how to deal with ambiguous datetimes:
253
-
254
- - `'raise'` (default): raise
255
- - `'earliest'`: use the earliest datetime
256
- - `'latest'`: use the latest datetime
257
- - `'null'`: set to null
258
-
259
- Notes
260
- -----
261
- When converting to a Datetime type, the time unit is inferred from the format
262
- string if given, eg: `"%F %T%.3f"` => `Datetime("ms")`. If no fractional
263
- second component is found, the default is `"us"`.
264
-
265
- Examples
266
- --------
267
- Dealing with a consistent format:
268
-
269
- >>> s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
270
- >>> s.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z")
271
- shape: (2,)
272
- Series: '' [datetime[μs, UTC]]
273
- [
274
- 2020-01-01 01:00:00 UTC
275
- 2020-01-01 02:00:00 UTC
276
- ]
277
-
278
- Dealing with different formats.
279
-
280
- >>> s = pl.Series(
281
- ... "date",
282
- ... [
283
- ... "2021-04-22",
284
- ... "2022-01-04 00:00:00",
285
- ... "01/31/22",
286
- ... "Sun Jul 8 00:34:60 2001",
287
- ... ],
288
- ... )
289
- >>> s.to_frame().select(
290
- ... pl.coalesce(
291
- ... pl.col("date").str.strptime(pl.Date, "%F", strict=False),
292
- ... pl.col("date").str.strptime(pl.Date, "%F %T", strict=False),
293
- ... pl.col("date").str.strptime(pl.Date, "%D", strict=False),
294
- ... pl.col("date").str.strptime(pl.Date, "%c", strict=False),
295
- ... )
296
- ... ).to_series()
297
- shape: (4,)
298
- Series: 'date' [date]
299
- [
300
- 2021-04-22
301
- 2022-01-04
302
- 2022-01-31
303
- 2001-07-08
304
- ]
305
- """
306
- if dtype == Date:
307
- return self.to_date(format, strict=strict, exact=exact, cache=cache)
308
- elif dtype == Datetime:
309
- time_unit = getattr(dtype, "time_unit", None)
310
- time_zone = getattr(dtype, "time_zone", None)
311
- return self.to_datetime(
312
- format,
313
- time_unit=time_unit,
314
- time_zone=time_zone,
315
- strict=strict,
316
- exact=exact,
317
- cache=cache,
318
- ambiguous=ambiguous,
319
- )
320
- elif dtype == Time:
321
- return self.to_time(format, strict=strict, cache=cache)
322
- else:
323
- msg = "`dtype` must be of type {Date, Datetime, Time}"
324
- raise ValueError(msg)
325
-
326
- @deprecate_nonkeyword_arguments(allowed_args=["self"], version="1.20.0")
327
- @unstable()
328
- def to_decimal(self, *, scale: int) -> Expr:
329
- """
330
- Convert a String column into a Decimal column.
331
-
332
- .. warning::
333
- This functionality is considered **unstable**. It may be changed
334
- at any point without it being considered a breaking change.
335
-
336
- .. versionchanged:: 1.20.0
337
- Parameter `inference_length` should now be passed as a keyword argument.
338
-
339
- .. versionchanged:: 1.33.0
340
- Parameter `inference_length` was removed and `scale` was made non-optional.
341
-
342
- Parameters
343
- ----------
344
- scale
345
- Number of digits after the comma to use for the decimals.
346
-
347
- Examples
348
- --------
349
- >>> df = pl.DataFrame(
350
- ... {
351
- ... "numbers": [
352
- ... "40.12",
353
- ... "3420.13",
354
- ... "120134.19",
355
- ... "3212.98",
356
- ... "12.90",
357
- ... "143.09",
358
- ... "143.9",
359
- ... ]
360
- ... }
361
- ... )
362
- >>> df.with_columns(numbers_decimal=pl.col("numbers").str.to_decimal(scale=2))
363
- shape: (7, 2)
364
- ┌───────────┬─────────────────┐
365
- │ numbers ┆ numbers_decimal │
366
- │ --- ┆ --- │
367
- │ str ┆ decimal[38,2] │
368
- ╞═══════════╪═════════════════╡
369
- │ 40.12 ┆ 40.12 │
370
- │ 3420.13 ┆ 3420.13 │
371
- │ 120134.19 ┆ 120134.19 │
372
- │ 3212.98 ┆ 3212.98 │
373
- │ 12.90 ┆ 12.90 │
374
- │ 143.09 ┆ 143.09 │
375
- │ 143.9 ┆ 143.90 │
376
- └───────────┴─────────────────┘
377
- """
378
- return wrap_expr(self._pyexpr.str_to_decimal(scale=scale))
379
-
380
- def len_bytes(self) -> Expr:
381
- """
382
- Return the length of each string as the number of bytes.
383
-
384
- Returns
385
- -------
386
- Expr
387
- Expression of data type :class:`UInt32`.
388
-
389
- See Also
390
- --------
391
- len_chars
392
-
393
- Notes
394
- -----
395
- When working with non-ASCII text, the length in bytes is not the same as the
396
- length in characters. You may want to use :func:`len_chars` instead.
397
- Note that :func:`len_bytes` is much more performant (_O(1)_) than
398
- :func:`len_chars` (_O(n)_).
399
-
400
- Examples
401
- --------
402
- >>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]})
403
- >>> df.with_columns(
404
- ... pl.col("a").str.len_bytes().alias("n_bytes"),
405
- ... pl.col("a").str.len_chars().alias("n_chars"),
406
- ... )
407
- shape: (4, 3)
408
- ┌──────┬─────────┬─────────┐
409
- │ a ┆ n_bytes ┆ n_chars │
410
- │ --- ┆ --- ┆ --- │
411
- │ str ┆ u32 ┆ u32 │
412
- ╞══════╪═════════╪═════════╡
413
- │ Café ┆ 5 ┆ 4 │
414
- │ 345 ┆ 3 ┆ 3 │
415
- │ 東京 ┆ 6 ┆ 2 │
416
- │ null ┆ null ┆ null │
417
- └──────┴─────────┴─────────┘
418
- """
419
- return wrap_expr(self._pyexpr.str_len_bytes())
420
-
421
- def len_chars(self) -> Expr:
422
- """
423
- Return the length of each string as the number of characters.
424
-
425
- Returns
426
- -------
427
- Expr
428
- Expression of data type :class:`UInt32`.
429
-
430
- See Also
431
- --------
432
- len_bytes
433
-
434
- Notes
435
- -----
436
- When working with ASCII text, use :func:`len_bytes` instead to achieve
437
- equivalent output with much better performance:
438
- :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_).
439
-
440
- A character is defined as a `Unicode scalar value`_. A single character is
441
- represented by a single byte when working with ASCII text, and a maximum of
442
- 4 bytes otherwise.
443
-
444
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
445
-
446
- Examples
447
- --------
448
- >>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]})
449
- >>> df.with_columns(
450
- ... pl.col("a").str.len_chars().alias("n_chars"),
451
- ... pl.col("a").str.len_bytes().alias("n_bytes"),
452
- ... )
453
- shape: (4, 3)
454
- ┌──────┬─────────┬─────────┐
455
- │ a ┆ n_chars ┆ n_bytes │
456
- │ --- ┆ --- ┆ --- │
457
- │ str ┆ u32 ┆ u32 │
458
- ╞══════╪═════════╪═════════╡
459
- │ Café ┆ 4 ┆ 5 │
460
- │ 345 ┆ 3 ┆ 3 │
461
- │ 東京 ┆ 2 ┆ 6 │
462
- │ null ┆ null ┆ null │
463
- └──────┴─────────┴─────────┘
464
- """
465
- return wrap_expr(self._pyexpr.str_len_chars())
466
-
467
- def to_uppercase(self) -> Expr:
468
- """
469
- Modify strings to their uppercase equivalent.
470
-
471
- Examples
472
- --------
473
- >>> df = pl.DataFrame({"foo": ["cat", "dog"]})
474
- >>> df.with_columns(foo_upper=pl.col("foo").str.to_uppercase())
475
- shape: (2, 2)
476
- ┌─────┬───────────┐
477
- │ foo ┆ foo_upper │
478
- │ --- ┆ --- │
479
- │ str ┆ str │
480
- ╞═════╪═══════════╡
481
- │ cat ┆ CAT │
482
- │ dog ┆ DOG │
483
- └─────┴───────────┘
484
- """
485
- return wrap_expr(self._pyexpr.str_to_uppercase())
486
-
487
- def to_lowercase(self) -> Expr:
488
- """
489
- Modify strings to their lowercase equivalent.
490
-
491
- Examples
492
- --------
493
- >>> df = pl.DataFrame({"foo": ["CAT", "DOG"]})
494
- >>> df.with_columns(foo_lower=pl.col("foo").str.to_lowercase())
495
- shape: (2, 2)
496
- ┌─────┬───────────┐
497
- │ foo ┆ foo_lower │
498
- │ --- ┆ --- │
499
- │ str ┆ str │
500
- ╞═════╪═══════════╡
501
- │ CAT ┆ cat │
502
- │ DOG ┆ dog │
503
- └─────┴───────────┘
504
- """
505
- return wrap_expr(self._pyexpr.str_to_lowercase())
506
-
507
- def to_titlecase(self) -> Expr:
508
- """
509
- Modify strings to their titlecase equivalent.
510
-
511
- Notes
512
- -----
513
- This is a form of case transform where the first letter of each word is
514
- capitalized, with the rest of the word in lowercase. Non-alphanumeric
515
- characters define the word boundaries.
516
-
517
- Examples
518
- --------
519
- >>> df = pl.DataFrame(
520
- ... {
521
- ... "quotes": [
522
- ... "'e.t. phone home'",
523
- ... "you talkin' to me?",
524
- ... "to infinity,and BEYOND!",
525
- ... ]
526
- ... }
527
- ... )
528
- >>> df.with_columns(
529
- ... quotes_title=pl.col("quotes").str.to_titlecase(),
530
- ... )
531
- shape: (3, 2)
532
- ┌─────────────────────────┬─────────────────────────┐
533
- │ quotes ┆ quotes_title │
534
- │ --- ┆ --- │
535
- │ str ┆ str │
536
- ╞═════════════════════════╪═════════════════════════╡
537
- │ 'e.t. phone home' ┆ 'E.T. Phone Home' │
538
- │ you talkin' to me? ┆ You Talkin' To Me? │
539
- │ to infinity,and BEYOND! ┆ To Infinity,And Beyond! │
540
- └─────────────────────────┴─────────────────────────┘
541
- """
542
- return wrap_expr(self._pyexpr.str_to_titlecase())
543
-
544
- def strip_chars(self, characters: IntoExpr = None) -> Expr:
545
- r"""
546
- Remove leading and trailing characters.
547
-
548
- Parameters
549
- ----------
550
- characters
551
- The set of characters to be removed. All combinations of this set of
552
- characters will be stripped from the start and end of the string. If set to
553
- None (default), all leading and trailing whitespace is removed instead.
554
-
555
- Examples
556
- --------
557
- >>> df = pl.DataFrame({"foo": [" hello", "\nworld"]})
558
- >>> df
559
- shape: (2, 1)
560
- ┌────────┐
561
- │ foo │
562
- │ --- │
563
- │ str │
564
- ╞════════╡
565
- │ hello │
566
- │ │
567
- │ world │
568
- └────────┘
569
-
570
- >>> df.with_columns(foo_stripped=pl.col("foo").str.strip_chars())
571
- shape: (2, 2)
572
- ┌────────┬──────────────┐
573
- │ foo ┆ foo_stripped │
574
- │ --- ┆ --- │
575
- │ str ┆ str │
576
- ╞════════╪══════════════╡
577
- │ hello ┆ hello │
578
- │ ┆ world │
579
- │ world ┆ │
580
- └────────┴──────────────┘
581
-
582
- Characters can be stripped by passing a string as argument. Note that whitespace
583
- will not be stripped automatically when doing so, unless that whitespace is
584
- also included in the string.
585
-
586
- >>> df.with_columns(foo_stripped=pl.col("foo").str.strip_chars("ow\n"))
587
- shape: (2, 2)
588
- ┌────────┬──────────────┐
589
- │ foo ┆ foo_stripped │
590
- │ --- ┆ --- │
591
- │ str ┆ str │
592
- ╞════════╪══════════════╡
593
- │ hello ┆ hell │
594
- │ ┆ rld │
595
- │ world ┆ │
596
- └────────┴──────────────┘
597
- """
598
- characters_pyexpr = parse_into_expression(characters, str_as_lit=True)
599
- return wrap_expr(self._pyexpr.str_strip_chars(characters_pyexpr))
600
-
601
- def strip_chars_start(self, characters: IntoExpr = None) -> Expr:
602
- r"""
603
- Remove leading characters.
604
-
605
- .. note::
606
- This method strips any characters present in `characters` from the
607
- start of the input, no matter their order. To strip a prefix (i.e.
608
- a "word" of characters in a certain order), use
609
- :func:`strip_prefix` instead.
610
-
611
- Parameters
612
- ----------
613
- characters
614
- The set of characters to be removed. All combinations of this set of
615
- characters will be stripped from the start of the string. If set to None
616
- (default), all leading whitespace is removed instead.
617
-
618
- See Also
619
- --------
620
- strip_prefix
621
- strip_chars_end
622
-
623
- Examples
624
- --------
625
- >>> df = pl.DataFrame({"foo": [" hello ", "\tworld"]})
626
- >>> df.with_columns(foo_strip_start=pl.col("foo").str.strip_chars_start())
627
- shape: (2, 2)
628
- ┌─────────┬─────────────────┐
629
- │ foo ┆ foo_strip_start │
630
- │ --- ┆ --- │
631
- │ str ┆ str │
632
- ╞═════════╪═════════════════╡
633
- │ hello ┆ hello │
634
- │ world ┆ world │
635
- └─────────┴─────────────────┘
636
-
637
- Characters can be stripped by passing a string as argument. Note that whitespace
638
- will not be stripped automatically when doing so.
639
-
640
- >>> df.with_columns(
641
- ... foo_strip_start=pl.col("foo").str.strip_chars_start("wod\t"),
642
- ... )
643
- shape: (2, 2)
644
- ┌─────────┬─────────────────┐
645
- │ foo ┆ foo_strip_start │
646
- │ --- ┆ --- │
647
- │ str ┆ str │
648
- ╞═════════╪═════════════════╡
649
- │ hello ┆ hello │
650
- │ world ┆ rld │
651
- └─────────┴─────────────────┘
652
-
653
- The order of the provided characters does not matter, they behave like a set.
654
-
655
- >>> pl.DataFrame({"foo": ["aabcdef"]}).with_columns(
656
- ... foo_strip_start=pl.col("foo").str.strip_chars_start("cba")
657
- ... )
658
- shape: (1, 2)
659
- ┌─────────┬─────────────────┐
660
- │ foo ┆ foo_strip_start │
661
- │ --- ┆ --- │
662
- │ str ┆ str │
663
- ╞═════════╪═════════════════╡
664
- │ aabcdef ┆ def │
665
- └─────────┴─────────────────┘
666
- """
667
- characters_pyexpr = parse_into_expression(characters, str_as_lit=True)
668
- return wrap_expr(self._pyexpr.str_strip_chars_start(characters_pyexpr))
669
-
670
def strip_chars_end(self, characters: IntoExpr = None) -> Expr:
    r"""
    Strip trailing characters from the end of each string.

    .. note::
        Every character listed in `characters` is removed from the end of
        the input regardless of the order in which it appears. To remove
        a suffix (an exact, ordered "word" of characters), use
        :func:`strip_suffix` instead.

    Parameters
    ----------
    characters
        The set of characters to strip. Any combination of these characters
        is removed from the end of the string. When None (the default),
        trailing whitespace is removed instead.

    See Also
    --------
    strip_suffix
    strip_chars_start

    Examples
    --------
    >>> df = pl.DataFrame({"foo": [" hello", "world\n"]})
    >>> df
    shape: (2, 1)
    ┌────────┐
    │ foo    │
    │ ---    │
    │ str    │
    ╞════════╡
    │  hello │
    │ world  │
    │        │
    └────────┘
    >>> df.with_columns(foo_strip_end=pl.col("foo").str.strip_chars_end())
    shape: (2, 2)
    ┌────────┬───────────────┐
    │ foo    ┆ foo_strip_end │
    │ ---    ┆ ---           │
    │ str    ┆ str           │
    ╞════════╪═══════════════╡
    │  hello ┆  hello        │
    │ world  ┆ world         │
    │        ┆               │
    └────────┴───────────────┘

    Passing a string strips exactly those characters. Whitespace is then only
    stripped if it is itself part of the given string.

    >>> df.with_columns(foo_strip_end=pl.col("foo").str.strip_chars_end("oldw "))
    shape: (2, 2)
    ┌────────┬───────────────┐
    │ foo    ┆ foo_strip_end │
    │ ---    ┆ ---           │
    │ str    ┆ str           │
    ╞════════╪═══════════════╡
    │  hello ┆  he           │
    │ world  ┆ world         │
    │        ┆               │
    └────────┴───────────────┘

    The characters act as a set, so their order is irrelevant.

    >>> pl.DataFrame({"foo": ["abcdeff"]}).with_columns(
    ...     foo_strip_end=pl.col("foo").str.strip_chars_end("fed")
    ... )
    shape: (1, 2)
    ┌─────────┬───────────────┐
    │ foo     ┆ foo_strip_end │
    │ ---     ┆ ---           │
    │ str     ┆ str           │
    ╞═════════╪═══════════════╡
    │ abcdeff ┆ abc           │
    └─────────┴───────────────┘
    """
    # A plain string is treated as a literal set of characters, not a column.
    strip_set = parse_into_expression(characters, str_as_lit=True)
    stripped = self._pyexpr.str_strip_chars_end(strip_set)
    return wrap_expr(stripped)
750
-
751
def strip_prefix(self, prefix: IntoExpr) -> Expr:
    """
    Strip a leading prefix from each string.

    If the string starts with the prefix, it is removed exactly once.

    .. note::
        Only the exact character sequence given in `prefix` is removed from
        the start of the input. To remove a set of characters in any order,
        use :func:`strip_chars_start` instead.

    Parameters
    ----------
    prefix
        The prefix to strip.

    See Also
    --------
    strip_chars_start
    strip_suffix

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["foobar", "foofoobar", "foo", "bar"]})
    >>> df.with_columns(pl.col("a").str.strip_prefix("foo").alias("stripped"))
    shape: (4, 2)
    ┌───────────┬──────────┐
    │ a         ┆ stripped │
    │ ---       ┆ ---      │
    │ str       ┆ str      │
    ╞═══════════╪══════════╡
    │ foobar    ┆ bar      │
    │ foofoobar ┆ foobar   │
    │ foo       ┆          │
    │ bar       ┆ bar      │
    └───────────┴──────────┘
    """
    # A plain string is treated as a literal prefix, not a column name.
    leading = parse_into_expression(prefix, str_as_lit=True)
    stripped = self._pyexpr.str_strip_prefix(leading)
    return wrap_expr(stripped)
790
-
791
def strip_suffix(self, suffix: IntoExpr) -> Expr:
    """
    Strip a trailing suffix from each string.

    If the string ends with the suffix, it is removed exactly once.

    .. note::
        Only the exact character sequence given in `suffix` is removed from
        the end of the input. To remove a set of characters in any order,
        use :func:`strip_chars_end` instead.

    Parameters
    ----------
    suffix
        The suffix to strip.

    See Also
    --------
    strip_chars_end
    strip_prefix

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["foobar", "foobarbar", "foo", "bar"]})
    >>> df.with_columns(pl.col("a").str.strip_suffix("bar").alias("stripped"))
    shape: (4, 2)
    ┌───────────┬──────────┐
    │ a         ┆ stripped │
    │ ---       ┆ ---      │
    │ str       ┆ str      │
    ╞═══════════╪══════════╡
    │ foobar    ┆ foo      │
    │ foobarbar ┆ foobar   │
    │ foo       ┆ foo      │
    │ bar       ┆          │
    └───────────┴──────────┘
    """
    # A plain string is treated as a literal suffix, not a column name.
    trailing = parse_into_expression(suffix, str_as_lit=True)
    stripped = self._pyexpr.str_strip_suffix(trailing)
    return wrap_expr(stripped)
830
-
831
def pad_start(self, length: int | IntoExprColumn, fill_char: str = " ") -> Expr:
    """
    Left-pad each string until it reaches the given length.

    Parameters
    ----------
    length
        Target length to pad to. Strings whose length is already equal to or
        greater than this value are returned unchanged.
    fill_char
        The character used for padding.

    Raises
    ------
    TypeError
        If `fill_char` is not a `str`.

    See Also
    --------
    pad_end
    zfill

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["cow", "monkey", "hippopotamus", None]})
    >>> df.with_columns(padded=pl.col("a").str.pad_start(8, "*"))
    shape: (4, 2)
    ┌──────────────┬──────────────┐
    │ a            ┆ padded       │
    │ ---          ┆ ---          │
    │ str          ┆ str          │
    ╞══════════════╪══════════════╡
    │ cow          ┆ *****cow     │
    │ monkey       ┆ **monkey     │
    │ hippopotamus ┆ hippopotamus │
    │ null         ┆ null         │
    └──────────────┴──────────────┘
    """
    target_len = parse_into_expression(length)
    if isinstance(fill_char, str):
        return wrap_expr(self._pyexpr.str_pad_start(target_len, fill_char))
    msg = (
        f'"pad_start" expects a `str`, given a {qualified_type_name(fill_char)!r}'
    )
    raise TypeError(msg)
869
-
870
def pad_end(self, length: int | IntoExprColumn, fill_char: str = " ") -> Expr:
    """
    Right-pad each string until it reaches the given length.

    Parameters
    ----------
    length
        Target length to pad to; may be an int or an expression. Strings whose
        length is already equal to or greater than this value are returned
        unchanged.
    fill_char
        The character used for padding.

    Raises
    ------
    TypeError
        If `fill_char` is not a `str`.

    See Also
    --------
    pad_start

    Examples
    --------
    >>> df = pl.DataFrame({"a": ["cow", "monkey", "hippopotamus", None]})
    >>> df.with_columns(padded=pl.col("a").str.pad_end(8, "*"))
    shape: (4, 2)
    ┌──────────────┬──────────────┐
    │ a            ┆ padded       │
    │ ---          ┆ ---          │
    │ str          ┆ str          │
    ╞══════════════╪══════════════╡
    │ cow          ┆ cow*****     │
    │ monkey       ┆ monkey**     │
    │ hippopotamus ┆ hippopotamus │
    │ null         ┆ null         │
    └──────────────┴──────────────┘
    """
    target_len = parse_into_expression(length)
    if isinstance(fill_char, str):
        return wrap_expr(self._pyexpr.str_pad_end(target_len, fill_char))
    msg = f'"pad_end" expects a `str`, given a {qualified_type_name(fill_char)!r}'
    raise TypeError(msg)
909
-
910
def zfill(self, length: int | IntoExprColumn) -> Expr:
    """
    Left-pad each string with zeros until it reaches the given length.

    When the string carries a sign prefix (`-`), the zeros are inserted after
    the sign character rather than in front of it.

    Parameters
    ----------
    length
        Target length to pad to. Strings whose length is already equal to or
        greater than this value are returned unchanged.

    See Also
    --------
    pad_start

    Notes
    -----
    This method is meant for padding numeric strings. For data containing
    non-ASCII characters, use :func:`pad_start` instead.

    Examples
    --------
    >>> df = pl.DataFrame({"a": [-1, 123, 999999, None]})
    >>> df.with_columns(zfill=pl.col("a").cast(pl.String).str.zfill(4))
    shape: (4, 2)
    ┌────────┬────────┐
    │ a      ┆ zfill  │
    │ ---    ┆ ---    │
    │ i64    ┆ str    │
    ╞════════╪════════╡
    │ -1     ┆ -001   │
    │ 123    ┆ 0123   │
    │ 999999 ┆ 999999 │
    │ null   ┆ null   │
    └────────┴────────┘
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [-1, 123, 999999, None],
    ...         "length": [8, 4, 1, 2],
    ...     }
    ... )
    >>> df.with_columns(zfill=pl.col("a").cast(pl.String).str.zfill("length"))
    shape: (4, 3)
    ┌────────┬────────┬──────────┐
    │ a      ┆ length ┆ zfill    │
    │ ---    ┆ ---    ┆ ---      │
    │ i64    ┆ i64    ┆ str      │
    ╞════════╪════════╪══════════╡
    │ -1     ┆ 8      ┆ -0000001 │
    │ 123    ┆ 4      ┆ 0123     │
    │ 999999 ┆ 1      ┆ 999999   │
    │ null   ┆ 2      ┆ null     │
    └────────┴────────┴──────────┘
    """
    target_len = parse_into_expression(length)
    zero_filled = self._pyexpr.str_zfill(target_len)
    return wrap_expr(zero_filled)
968
-
969
def contains(
    self, pattern: str | Expr, *, literal: bool = False, strict: bool = True
) -> Expr:
    """
    Test whether each string contains a substring matching a pattern.

    Parameters
    ----------
    pattern
        A valid regular expression pattern, compatible with the `regex crate
        <https://docs.rs/regex/latest/regex/>`_.
    literal
        Interpret `pattern` as a literal string rather than a regular
        expression.
    strict
        Raise an error if the underlying pattern is not a valid regex,
        otherwise mask out with a null value.

    Notes
    -----
    Regular expression behaviour (such as case-sensitivity) can be changed
    with flags via the inline `(?iLmsuxU)` syntax. For example:

    >>> pl.DataFrame({"s": ["AAA", "aAa", "aaa"]}).with_columns(
    ...     default_match=pl.col("s").str.contains("AA"),
    ...     insensitive_match=pl.col("s").str.contains("(?i)AA"),
    ... )
    shape: (3, 3)
    ┌─────┬───────────────┬───────────────────┐
    │ s   ┆ default_match ┆ insensitive_match │
    │ --- ┆ ---           ┆ ---               │
    │ str ┆ bool          ┆ bool              │
    ╞═════╪═══════════════╪═══════════════════╡
    │ AAA ┆ true          ┆ true              │
    │ aAa ┆ false         ┆ true              │
    │ aaa ┆ false         ┆ true              │
    └─────┴───────────────┴───────────────────┘

    The regex crate's section on `grouping and flags
    <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ has
    more information about inline expression modifiers.

    See Also
    --------
    starts_with : Check if string values start with a substring.
    ends_with : Check if string values end with a substring.
    find : Return the index of the first substring matching a pattern.

    Examples
    --------
    >>> df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", None]})
    >>> df.select(
    ...     pl.col("txt"),
    ...     pl.col("txt").str.contains("cat|bit").alias("regex"),
    ...     pl.col("txt").str.contains("rab$", literal=True).alias("literal"),
    ... )
    shape: (4, 3)
    ┌─────────────┬───────┬─────────┐
    │ txt         ┆ regex ┆ literal │
    │ ---         ┆ ---   ┆ ---     │
    │ str         ┆ bool  ┆ bool    │
    ╞═════════════╪═══════╪═════════╡
    │ Crab        ┆ false ┆ false   │
    │ cat and dog ┆ true  ┆ false   │
    │ rab$bit     ┆ true  ┆ true    │
    │ null        ┆ null  ┆ null    │
    └─────────────┴───────┴─────────┘
    """
    # A plain string is the pattern itself, not a column name.
    needle = parse_into_expression(pattern, str_as_lit=True)
    matched = self._pyexpr.str_contains(needle, literal, strict)
    return wrap_expr(matched)
1038
-
1039
def find(
    self, pattern: str | Expr, *, literal: bool = False, strict: bool = True
) -> Expr:
    """
    Return the byte offset of the first substring matching a pattern.

    Returns None when the pattern is not found.

    Parameters
    ----------
    pattern
        A valid regular expression pattern, compatible with the `regex crate
        <https://docs.rs/regex/latest/regex/>`_.
    literal
        Interpret `pattern` as a literal string rather than a regular
        expression.
    strict
        Raise an error if the underlying pattern is not a valid regex,
        otherwise mask out with a null value.

    Notes
    -----
    Regular expression behaviour (such as case-sensitivity) can be changed
    with flags via the inline `(?iLmsuxU)` syntax. For example:

    >>> pl.DataFrame({"s": ["AAA", "aAa", "aaa"]}).with_columns(
    ...     default_match=pl.col("s").str.find("Aa"),
    ...     insensitive_match=pl.col("s").str.find("(?i)Aa"),
    ... )
    shape: (3, 3)
    ┌─────┬───────────────┬───────────────────┐
    │ s   ┆ default_match ┆ insensitive_match │
    │ --- ┆ ---           ┆ ---               │
    │ str ┆ u32           ┆ u32               │
    ╞═════╪═══════════════╪═══════════════════╡
    │ AAA ┆ null          ┆ 0                 │
    │ aAa ┆ 1             ┆ 0                 │
    │ aaa ┆ null          ┆ 0                 │
    └─────┴───────────────┴───────────────────┘

    The regex crate's section on `grouping and flags
    <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ has
    more information about inline expression modifiers.

    See Also
    --------
    contains : Check if the string contains a substring that matches a pattern.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "txt": ["Crab", "Lobster", None, "Crustacean"],
    ...         "pat": ["a[bc]", "b.t", "[aeiuo]", "(?i)A[BC]"],
    ...     }
    ... )

    Locate the first substring that matches a regex or a literal pattern:

    >>> df.select(
    ...     pl.col("txt"),
    ...     pl.col("txt").str.find("a|e").alias("a|e (regex)"),
    ...     pl.col("txt").str.find("e", literal=True).alias("e (lit)"),
    ... )
    shape: (4, 3)
    ┌────────────┬─────────────┬─────────┐
    │ txt        ┆ a|e (regex) ┆ e (lit) │
    │ ---        ┆ ---         ┆ ---     │
    │ str        ┆ u32         ┆ u32     │
    ╞════════════╪═════════════╪═════════╡
    │ Crab       ┆ 2           ┆ null    │
    │ Lobster    ┆ 5           ┆ 5       │
    │ null       ┆ null        ┆ null    │
    │ Crustacean ┆ 5           ┆ 7       │
    └────────────┴─────────────┴─────────┘

    Match against a pattern taken from another column (or expression):

    >>> df.with_columns(pl.col("txt").str.find(pl.col("pat")).alias("find_pat"))
    shape: (4, 3)
    ┌────────────┬───────────┬──────────┐
    │ txt        ┆ pat       ┆ find_pat │
    │ ---        ┆ ---       ┆ ---      │
    │ str        ┆ str       ┆ u32      │
    ╞════════════╪═══════════╪══════════╡
    │ Crab       ┆ a[bc]     ┆ 2        │
    │ Lobster    ┆ b.t       ┆ 2        │
    │ null       ┆ [aeiuo]   ┆ null     │
    │ Crustacean ┆ (?i)A[BC] ┆ 5        │
    └────────────┴───────────┴──────────┘
    """
    # A plain string is the pattern itself, not a column name.
    needle = parse_into_expression(pattern, str_as_lit=True)
    offset = self._pyexpr.str_find(needle, literal, strict)
    return wrap_expr(offset)
1131
-
1132
def ends_with(self, suffix: str | Expr) -> Expr:
    """
    Test whether each string ends with a given substring.

    Parameters
    ----------
    suffix
        Suffix substring.

    See Also
    --------
    contains : Check if the string contains a substring that matches a pattern.
    starts_with : Check if string values start with a substring.

    Examples
    --------
    >>> df = pl.DataFrame({"fruits": ["apple", "mango", None]})
    >>> df.with_columns(
    ...     pl.col("fruits").str.ends_with("go").alias("has_suffix"),
    ... )
    shape: (3, 2)
    ┌────────┬────────────┐
    │ fruits ┆ has_suffix │
    │ ---    ┆ ---        │
    │ str    ┆ bool       │
    ╞════════╪════════════╡
    │ apple  ┆ false      │
    │ mango  ┆ true       │
    │ null   ┆ null       │
    └────────┴────────────┘

    >>> df = pl.DataFrame(
    ...     {"fruits": ["apple", "mango", "banana"], "suffix": ["le", "go", "nu"]}
    ... )
    >>> df.with_columns(
    ...     pl.col("fruits").str.ends_with(pl.col("suffix")).alias("has_suffix"),
    ... )
    shape: (3, 3)
    ┌────────┬────────┬────────────┐
    │ fruits ┆ suffix ┆ has_suffix │
    │ ---    ┆ ---    ┆ ---        │
    │ str    ┆ str    ┆ bool       │
    ╞════════╪════════╪════════════╡
    │ apple  ┆ le     ┆ true       │
    │ mango  ┆ go     ┆ true       │
    │ banana ┆ nu     ┆ false      │
    └────────┴────────┴────────────┘

    `ends_with` can also serve as a filter condition:

    >>> df.filter(pl.col("fruits").str.ends_with("go"))
    shape: (1, 2)
    ┌────────┬────────┐
    │ fruits ┆ suffix │
    │ ---    ┆ ---    │
    │ str    ┆ str    │
    ╞════════╪════════╡
    │ mango  ┆ go     │
    └────────┴────────┘
    """
    # A plain string is the suffix itself, not a column name.
    tail = parse_into_expression(suffix, str_as_lit=True)
    matched = self._pyexpr.str_ends_with(tail)
    return wrap_expr(matched)
1194
-
1195
def starts_with(self, prefix: str | Expr) -> Expr:
    """
    Test whether each string starts with a given substring.

    Parameters
    ----------
    prefix
        Prefix substring.

    See Also
    --------
    contains : Check if the string contains a substring that matches a pattern.
    ends_with : Check if string values end with a substring.

    Examples
    --------
    >>> df = pl.DataFrame({"fruits": ["apple", "mango", None]})
    >>> df.with_columns(
    ...     pl.col("fruits").str.starts_with("app").alias("has_prefix"),
    ... )
    shape: (3, 2)
    ┌────────┬────────────┐
    │ fruits ┆ has_prefix │
    │ ---    ┆ ---        │
    │ str    ┆ bool       │
    ╞════════╪════════════╡
    │ apple  ┆ true       │
    │ mango  ┆ false      │
    │ null   ┆ null       │
    └────────┴────────────┘

    >>> df = pl.DataFrame(
    ...     {"fruits": ["apple", "mango", "banana"], "prefix": ["app", "na", "ba"]}
    ... )
    >>> df.with_columns(
    ...     pl.col("fruits").str.starts_with(pl.col("prefix")).alias("has_prefix"),
    ... )
    shape: (3, 3)
    ┌────────┬────────┬────────────┐
    │ fruits ┆ prefix ┆ has_prefix │
    │ ---    ┆ ---    ┆ ---        │
    │ str    ┆ str    ┆ bool       │
    ╞════════╪════════╪════════════╡
    │ apple  ┆ app    ┆ true       │
    │ mango  ┆ na     ┆ false      │
    │ banana ┆ ba     ┆ true       │
    └────────┴────────┴────────────┘

    `starts_with` can also serve as a filter condition:

    >>> df.filter(pl.col("fruits").str.starts_with("app"))
    shape: (1, 2)
    ┌────────┬────────┐
    │ fruits ┆ prefix │
    │ ---    ┆ ---    │
    │ str    ┆ str    │
    ╞════════╪════════╡
    │ apple  ┆ app    │
    └────────┴────────┘
    """
    # A plain string is the prefix itself, not a column name.
    head = parse_into_expression(prefix, str_as_lit=True)
    matched = self._pyexpr.str_starts_with(head)
    return wrap_expr(matched)
1257
-
1258
def json_decode(
    self,
    dtype: PolarsDataType | pl.DataTypeExpr,
    *,
    infer_schema_length: int | None = None,
) -> Expr:
    """
    Parse string values as JSON.

    Throws an error if invalid JSON strings are encountered.

    Parameters
    ----------
    dtype
        The dtype to cast the extracted value to.
    infer_schema_length
        Deprecated and ignored.

        .. versionchanged:: 1.33.0
            Deprecate `infer_schema_length` and make `dtype` non-optional to
            ensure that the planner can determine the output datatype.

    Raises
    ------
    TypeError
        If `dtype` is None.

    See Also
    --------
    json_path_match : Extract the first match from a JSON string using the provided
                      JSONPath.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"json": ['{"a":1, "b": true}', None, '{"a":2, "b": false}']}
    ... )
    >>> dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])
    >>> df.with_columns(decoded=pl.col("json").str.json_decode(dtype))
    shape: (3, 2)
    ┌─────────────────────┬───────────┐
    │ json                ┆ decoded   │
    │ ---                 ┆ ---       │
    │ str                 ┆ struct[2] │
    ╞═════════════════════╪═══════════╡
    │ {"a":1, "b": true}  ┆ {1,true}  │
    │ null                ┆ null      │
    │ {"a":2, "b": false} ┆ {2,false} │
    └─────────────────────┴───────────┘
    """
    # `dtype` has no default, but an explicit None is still rejected here.
    if dtype is None:
        msg = "`Expr.str.json_decode` needs an explicitly given `dtype` otherwise Polars is not able to determine the output type. If you want to eagerly infer datatype you can use `Series.str.json_decode`."
        raise TypeError(msg)

    # The argument is accepted only for backward compatibility; warn and ignore.
    if infer_schema_length is not None:
        issue_warning(
            "`Expr.str.json_decode` with `infer_schema_length` is deprecated and has no effect on execution.",
            DeprecationWarning,
        )

    dtype_expr = parse_into_datatype_expr(dtype)._pydatatype_expr
    return wrap_expr(self._pyexpr.str_json_decode(dtype_expr))
1315
-
1316
def json_path_match(self, json_path: IntoExprColumn) -> Expr:
    """
    Return the first match of a JSONPath query against each JSON string.

    Invalid JSON strings raise an error. Every returned value is cast to
    :class:`String`, whatever its original type.

    The JSONPath standard is documented
    `here <https://goessner.net/articles/JsonPath/>`_.

    Parameters
    ----------
    json_path
        A valid JSONPath query string.

    Returns
    -------
    Expr
        Expression of data type :class:`String`. Contains null values if original
        value is null or the json_path returns nothing.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"json_val": ['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']}
    ... )
    >>> df.with_columns(matched=pl.col("json_val").str.json_path_match("$.a"))
    shape: (5, 2)
    ┌────────────┬─────────┐
    │ json_val   ┆ matched │
    │ ---        ┆ ---     │
    │ str        ┆ str     │
    ╞════════════╪═════════╡
    │ {"a":"1"}  ┆ 1       │
    │ null       ┆ null    │
    │ {"a":2}    ┆ 2       │
    │ {"a":2.1}  ┆ 2.1     │
    │ {"a":true} ┆ true    │
    └────────────┴─────────┘
    """
    # A plain string is the JSONPath query itself, not a column name.
    query = parse_into_expression(json_path, str_as_lit=True)
    matched = self._pyexpr.str_json_path_match(query)
    return wrap_expr(matched)
1358
-
1359
def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Expr:
    r"""
    Decode each value with the given encoding.

    Parameters
    ----------
    encoding : {'hex', 'base64'}
        The encoding to use.
    strict
        Raise an error if the underlying value cannot be decoded,
        otherwise mask out with a null value.

    Returns
    -------
    Expr
        Expression of data type :class:`Binary`.

    Raises
    ------
    ValueError
        If `encoding` is neither ``'hex'`` nor ``'base64'``.

    Examples
    --------
    >>> df = pl.DataFrame({"color": ["000000", "ffff00", "0000ff"]})
    >>> df.with_columns(pl.col("color").str.decode("hex").alias("decoded"))
    shape: (3, 2)
    ┌────────┬─────────────────┐
    │ color  ┆ decoded         │
    │ ---    ┆ ---             │
    │ str    ┆ binary          │
    ╞════════╪═════════════════╡
    │ 000000 ┆ b"\x00\x00\x00" │
    │ ffff00 ┆ b"\xff\xff\x00" │
    │ 0000ff ┆ b"\x00\x00\xff" │
    └────────┴─────────────────┘
    """
    if encoding == "hex":
        return wrap_expr(self._pyexpr.str_hex_decode(strict))
    if encoding == "base64":
        return wrap_expr(self._pyexpr.str_base64_decode(strict))

    # Anything else is unsupported.
    msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
    raise ValueError(msg)
1398
-
1399
def encode(self, encoding: TransferEncoding) -> Expr:
    """
    Encode each value with the given encoding.

    Parameters
    ----------
    encoding : {'hex', 'base64'}
        The encoding to use.

    Returns
    -------
    Expr
        Expression of data type :class:`String`.

    Raises
    ------
    ValueError
        If `encoding` is neither ``'hex'`` nor ``'base64'``.

    Examples
    --------
    >>> df = pl.DataFrame({"strings": ["foo", "bar", None]})
    >>> df.with_columns(strings_hex=pl.col("strings").str.encode("hex"))
    shape: (3, 2)
    ┌─────────┬─────────────┐
    │ strings ┆ strings_hex │
    │ ---     ┆ ---         │
    │ str     ┆ str         │
    ╞═════════╪═════════════╡
    │ foo     ┆ 666f6f      │
    │ bar     ┆ 626172      │
    │ null    ┆ null        │
    └─────────┴─────────────┘
    """
    if encoding == "hex":
        return wrap_expr(self._pyexpr.str_hex_encode())
    if encoding == "base64":
        return wrap_expr(self._pyexpr.str_base64_encode())

    # Anything else is unsupported.
    msg = f"`encoding` must be one of {{'hex', 'base64'}}, got {encoding!r}"
    raise ValueError(msg)
1435
-
1436
def extract(self, pattern: IntoExprColumn, group_index: int = 1) -> Expr:
    r"""
    Extract one capture group of a regex pattern from each string.

    Parameters
    ----------
    pattern
        A valid regular expression pattern containing at least one capture group,
        compatible with the `regex crate <https://docs.rs/regex/latest/regex/>`_.
    group_index
        Index of the capture group to return.
        Group 0 is the whole pattern; the first group has index 1.
        Defaults to the first capture group.

    Notes
    -----
    Regular expression behaviour (such as multi-line matching) can be changed
    with flags via the inline `(?iLmsuxU)` syntax. For example:

    >>> df = pl.DataFrame(
    ...     data={
    ...         "lines": [
    ...             "I Like\nThose\nOdds",
    ...             "This is\nThe Way",
    ...         ]
    ...     }
    ... )
    >>> df.with_columns(
    ...     pl.col("lines").str.extract(r"(?m)^(T\w+)", 1).alias("matches"),
    ... )
    shape: (2, 2)
    ┌─────────┬─────────┐
    │ lines   ┆ matches │
    │ ---     ┆ ---     │
    │ str     ┆ str     │
    ╞═════════╪═════════╡
    │ I Like  ┆ Those   │
    │ Those   ┆         │
    │ Odds    ┆         │
    │ This is ┆ This    │
    │ The Way ┆         │
    └─────────┴─────────┘

    The regex crate's section on `grouping and flags
    <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ has
    more information about inline expression modifiers.

    Returns
    -------
    Expr
        Expression of data type :class:`String`. Contains null values if original
        value is null or the regex captures nothing.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "url": [
    ...             "http://vote.com/ballon_dor?error=404&ref=unknown",
    ...             "http://vote.com/ballon_dor?ref=polars&candidate=messi",
    ...             "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
    ...         ]
    ...     }
    ... )
    >>> df.select(
    ...     pl.col("url").str.extract(r"candidate=(\w+)", 1).alias("candidate"),
    ...     pl.col("url").str.extract(r"ref=(\w+)", 1).alias("referer"),
    ...     pl.col("url").str.extract(r"error=(\w+)", 1).alias("error"),
    ... )
    shape: (3, 3)
    ┌───────────┬─────────┬───────┐
    │ candidate ┆ referer ┆ error │
    │ ---       ┆ ---     ┆ ---   │
    │ str       ┆ str     ┆ str   │
    ╞═══════════╪═════════╪═══════╡
    │ null      ┆ unknown ┆ 404   │
    │ messi     ┆ polars  ┆ null  │
    │ ronaldo   ┆ polars  ┆ null  │
    └───────────┴─────────┴───────┘
    """
    # A plain string is the pattern itself, not a column name.
    regex = parse_into_expression(pattern, str_as_lit=True)
    captured = self._pyexpr.str_extract(regex, group_index)
    return wrap_expr(captured)
1518
-
1519
def extract_all(self, pattern: str | Expr) -> Expr:
    r'''
    Extract every match of a regex pattern from each string.

    Each successive non-overlapping regex match within an individual string is
    collected into a list. A `null` haystack string yields `null`.

    Parameters
    ----------
    pattern
        A valid regular expression pattern, compatible with the `regex crate
        <https://docs.rs/regex/latest/regex/>`_.

    Notes
    -----
    Regular expression behaviour (such as "verbose" mode and/or
    case-sensitive matching) can be changed with flags via the inline
    `(?iLmsuxU)` syntax. For example:

    >>> df = pl.DataFrame(
    ...     data={
    ...         "email": [
    ...             "real.email@spam.com",
    ...             "some_account@somewhere.net",
    ...             "abc.def.ghi.jkl@uvw.xyz.co.uk",
    ...         ]
    ...     }
    ... )
    >>> # extract name/domain parts from the addresses, using verbose regex
    >>> df.with_columns(
    ...     pl.col("email")
    ...     .str.extract_all(
    ...         r"""(?xi)   # activate 'verbose' and 'case-insensitive' flags
    ...         [           # (start character group)
    ...           A-Z       # letters
    ...           0-9       # digits
    ...           ._%+\-    # special chars
    ...         ]           # (end character group)
    ...         +           # 'one or more' quantifier
    ...         """
    ...     )
    ...     .list.to_struct(fields=["name", "domain"])
    ...     .alias("email_parts")
    ... ).unnest("email_parts")
    shape: (3, 3)
    ┌───────────────────────────────┬─────────────────┬───────────────┐
    │ email                         ┆ name            ┆ domain        │
    │ ---                           ┆ ---             ┆ ---           │
    │ str                           ┆ str             ┆ str           │
    ╞═══════════════════════════════╪═════════════════╪═══════════════╡
    │ real.email@spam.com           ┆ real.email      ┆ spam.com      │
    │ some_account@somewhere.net    ┆ some_account    ┆ somewhere.net │
    │ abc.def.ghi.jkl@uvw.xyz.co.uk ┆ abc.def.ghi.jkl ┆ uvw.xyz.co.uk │
    └───────────────────────────────┴─────────────────┴───────────────┘

    The regex crate's section on `grouping and flags
    <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ has
    more information about inline expression modifiers.

    Returns
    -------
    Expr
        Expression of data type `List(String)`.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t", "bar", None]})
    >>> df.select(
    ...     pl.col("foo").str.extract_all(r"\d+").alias("extracted_nrs"),
    ... )
    shape: (4, 1)
    ┌────────────────┐
    │ extracted_nrs  │
    │ ---            │
    │ list[str]      │
    ╞════════════════╡
    │ ["123", "45"]  │
    │ ["678", "910"] │
    │ []             │
    │ null           │
    └────────────────┘

    '''
    # A plain string is the pattern itself, not a column name.
    regex = parse_into_expression(pattern, str_as_lit=True)
    all_matches = self._pyexpr.str_extract_all(regex)
    return wrap_expr(all_matches)
1604
-
1605
def extract_groups(self, pattern: str) -> Expr:
    r"""
    Extract every capture group of a regex pattern from each string.

    Parameters
    ----------
    pattern
        A valid regular expression pattern containing at least one capture group,
        compatible with the `regex crate <https://docs.rs/regex/latest/regex/>`_.

    Notes
    -----
    All group names are **strings**.

    For unnamed groups in the pattern, the numerical position is converted
    to a string.

    For example, here we access groups 2 and 3 via the names `"2"` and `"3"`::

        >>> df = pl.DataFrame({"col": ["foo bar baz"]})
        >>> (
        ...     df.with_columns(
        ...         pl.col("col").str.extract_groups(r"(\S+) (\S+) (.+)")
        ...     ).select(pl.col("col").struct["2"], pl.col("col").struct["3"])
        ... )
        shape: (1, 2)
        ┌─────┬─────┐
        │ 2   ┆ 3   │
        │ --- ┆ --- │
        │ str ┆ str │
        ╞═════╪═════╡
        │ bar ┆ baz │
        └─────┴─────┘

    Returns
    -------
    Expr
        Expression of data type :class:`Struct` with fields of data type
        :class:`String`.

    Raises
    ------
    TypeError
        If `pattern` is not a `str`.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     data={
    ...         "url": [
    ...             "http://vote.com/ballon_dor?candidate=messi&ref=python",
    ...             "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
    ...             "http://vote.com/ballon_dor?error=404&ref=rust",
    ...         ]
    ...     }
    ... )
    >>> pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"
    >>> df.select(captures=pl.col("url").str.extract_groups(pattern)).unnest(
    ...     "captures"
    ... )
    shape: (3, 2)
    ┌───────────┬────────┐
    │ candidate ┆ ref    │
    │ ---       ┆ ---    │
    │ str       ┆ str    │
    ╞═══════════╪════════╡
    │ messi     ┆ python │
    │ weghorst  ┆ polars │
    │ null      ┆ null   │
    └───────────┴────────┘

    Unnamed groups get their numerical position as a string name:

    >>> pattern = r"candidate=(\w+)&ref=(\w+)"
    >>> (
    ...     df.with_columns(
    ...         captures=pl.col("url").str.extract_groups(pattern)
    ...     ).with_columns(name=pl.col("captures").struct["1"].str.to_uppercase())
    ... )
    shape: (3, 3)
    ┌─────────────────────────────────┬───────────────────────┬──────────┐
    │ url                             ┆ captures              ┆ name     │
    │ ---                             ┆ ---                   ┆ ---      │
    │ str                             ┆ struct[2]             ┆ str      │
    ╞═════════════════════════════════╪═══════════════════════╪══════════╡
    │ http://vote.com/ballon_dor?can… ┆ {"messi","python"}    ┆ MESSI    │
    │ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │
    │ http://vote.com/ballon_dor?err… ┆ {null,null}           ┆ null     │
    └─────────────────────────────────┴───────────────────────┴──────────┘
    """
    # The pattern is passed through as a plain string, so reject anything else.
    if isinstance(pattern, str):
        return wrap_expr(self._pyexpr.str_extract_groups(pattern))
    msg = (
        f'"extract_groups" expects a `str`, given a {qualified_type_name(pattern)!r}'
    )
    raise TypeError(msg)
1694
-
1695
def count_matches(self, pattern: str | Expr, *, literal: bool = False) -> Expr:
    r"""
    Count the successive non-overlapping regex matches in each string.

    Parameters
    ----------
    pattern
        A valid regular expression pattern, compatible with the `regex crate
        <https://docs.rs/regex/latest/regex/>`_.
    literal
        Interpret `pattern` as a literal string rather than a regular
        expression.

    Returns
    -------
    Expr
        Expression of data type :class:`UInt32`. Returns null if the
        original value is null.

    Examples
    --------
    >>> df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t", "bar", None]})
    >>> df.with_columns(
    ...     pl.col("foo").str.count_matches(r"\d").alias("count_digits"),
    ... )
    shape: (4, 2)
    ┌────────────────┬──────────────┐
    │ foo            ┆ count_digits │
    │ ---            ┆ ---          │
    │ str            ┆ u32          │
    ╞════════════════╪══════════════╡
    │ 123 bla 45 asd ┆ 5            │
    │ xyz 678 910t   ┆ 6            │
    │ bar            ┆ 0            │
    │ null           ┆ null         │
    └────────────────┴──────────────┘

    >>> df = pl.DataFrame({"bar": ["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None]})
    >>> df.with_columns(
    ...     pl.col("bar")
    ...     .str.count_matches(r"\d", literal=True)
    ...     .alias("count_digits"),
    ... )
    shape: (4, 2)
    ┌────────────┬──────────────┐
    │ bar        ┆ count_digits │
    │ ---        ┆ ---          │
    │ str        ┆ u32          │
    ╞════════════╪══════════════╡
    │ 12 dbc 3xy ┆ 0            │
    │ cat\w      ┆ 0            │
    │ 1zy3\d\d   ┆ 2            │
    │ null       ┆ null         │
    └────────────┴──────────────┘
    """
    # A plain string is the pattern itself, not a column name.
    regex = parse_into_expression(pattern, str_as_lit=True)
    n_matches = self._pyexpr.str_count_matches(regex, literal)
    return wrap_expr(n_matches)
1751
-
1752
- def split(self, by: IntoExpr, *, inclusive: bool = False) -> Expr:
1753
- """
1754
- Split the string by a substring.
1755
-
1756
- Parameters
1757
- ----------
1758
- by
1759
- Substring to split by.
1760
- inclusive
1761
- If True, include the split character/string in the results.
1762
-
1763
- Examples
1764
- --------
1765
- >>> df = pl.DataFrame({"s": ["foo bar", "foo_bar", "foo_bar_baz"]})
1766
- >>> df.with_columns(
1767
- ... pl.col("s").str.split(by="_").alias("split"),
1768
- ... pl.col("s").str.split(by="_", inclusive=True).alias("split_inclusive"),
1769
- ... )
1770
- shape: (3, 3)
1771
- ┌─────────────┬───────────────────────┬─────────────────────────┐
1772
- │ s ┆ split ┆ split_inclusive │
1773
- │ --- ┆ --- ┆ --- │
1774
- │ str ┆ list[str] ┆ list[str] │
1775
- ╞═════════════╪═══════════════════════╪═════════════════════════╡
1776
- │ foo bar ┆ ["foo bar"] ┆ ["foo bar"] │
1777
- │ foo_bar ┆ ["foo", "bar"] ┆ ["foo_", "bar"] │
1778
- │ foo_bar_baz ┆ ["foo", "bar", "baz"] ┆ ["foo_", "bar_", "baz"] │
1779
- └─────────────┴───────────────────────┴─────────────────────────┘
1780
-
1781
- >>> df = pl.DataFrame(
1782
- ... {"s": ["foo^bar", "foo_bar", "foo*bar*baz"], "by": ["_", "_", "*"]}
1783
- ... )
1784
- >>> df.with_columns(
1785
- ... pl.col("s").str.split(by=pl.col("by")).alias("split"),
1786
- ... pl.col("s")
1787
- ... .str.split(by=pl.col("by"), inclusive=True)
1788
- ... .alias("split_inclusive"),
1789
- ... )
1790
- shape: (3, 4)
1791
- ┌─────────────┬─────┬───────────────────────┬─────────────────────────┐
1792
- │ s ┆ by ┆ split ┆ split_inclusive │
1793
- │ --- ┆ --- ┆ --- ┆ --- │
1794
- │ str ┆ str ┆ list[str] ┆ list[str] │
1795
- ╞═════════════╪═════╪═══════════════════════╪═════════════════════════╡
1796
- │ foo^bar ┆ _ ┆ ["foo^bar"] ┆ ["foo^bar"] │
1797
- │ foo_bar ┆ _ ┆ ["foo", "bar"] ┆ ["foo_", "bar"] │
1798
- │ foo*bar*baz ┆ * ┆ ["foo", "bar", "baz"] ┆ ["foo*", "bar*", "baz"] │
1799
- └─────────────┴─────┴───────────────────────┴─────────────────────────┘
1800
-
1801
- Returns
1802
- -------
1803
- Expr
1804
- Expression of data type :class:`String`.
1805
- """
1806
- by_pyexpr = parse_into_expression(by, str_as_lit=True)
1807
- if inclusive:
1808
- return wrap_expr(self._pyexpr.str_split_inclusive(by_pyexpr))
1809
- return wrap_expr(self._pyexpr.str_split(by_pyexpr))
1810
-
1811
- def split_exact(self, by: IntoExpr, n: int, *, inclusive: bool = False) -> Expr:
1812
- """
1813
- Split the string by a substring using `n` splits.
1814
-
1815
- Results in a struct of `n+1` fields.
1816
-
1817
- If it cannot make `n` splits, the remaining field elements will be null.
1818
-
1819
- Parameters
1820
- ----------
1821
- by
1822
- Substring to split by.
1823
- n
1824
- Number of splits to make.
1825
- inclusive
1826
- If True, include the split character/string in the results.
1827
-
1828
- Returns
1829
- -------
1830
- Expr
1831
- Expression of data type :class:`Struct` with fields of data type
1832
- :class:`String`.
1833
-
1834
- Examples
1835
- --------
1836
- >>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
1837
- >>> df.with_columns(
1838
- ... extracted=pl.col("x").str.split_exact("_", 1).alias("fields"),
1839
- ... )
1840
- shape: (4, 2)
1841
- ┌──────┬─────────────┐
1842
- │ x ┆ extracted │
1843
- │ --- ┆ --- │
1844
- │ str ┆ struct[2] │
1845
- ╞══════╪═════════════╡
1846
- │ a_1 ┆ {"a","1"} │
1847
- │ null ┆ {null,null} │
1848
- │ c ┆ {"c",null} │
1849
- │ d_4 ┆ {"d","4"} │
1850
- └──────┴─────────────┘
1851
-
1852
-
1853
- Split string values in column x in exactly 2 parts and assign
1854
- each part to a new column.
1855
-
1856
- >>> df.with_columns(
1857
- ... pl.col("x")
1858
- ... .str.split_exact("_", 1)
1859
- ... .struct.rename_fields(["first_part", "second_part"])
1860
- ... .alias("fields")
1861
- ... ).unnest("fields")
1862
- shape: (4, 3)
1863
- ┌──────┬────────────┬─────────────┐
1864
- │ x ┆ first_part ┆ second_part │
1865
- │ --- ┆ --- ┆ --- │
1866
- │ str ┆ str ┆ str │
1867
- ╞══════╪════════════╪═════════════╡
1868
- │ a_1 ┆ a ┆ 1 │
1869
- │ null ┆ null ┆ null │
1870
- │ c ┆ c ┆ null │
1871
- │ d_4 ┆ d ┆ 4 │
1872
- └──────┴────────────┴─────────────┘
1873
- """
1874
- by_pyexpr = parse_into_expression(by, str_as_lit=True)
1875
- if inclusive:
1876
- return wrap_expr(self._pyexpr.str_split_exact_inclusive(by_pyexpr, n))
1877
- return wrap_expr(self._pyexpr.str_split_exact(by_pyexpr, n))
1878
-
1879
- def splitn(self, by: IntoExpr, n: int) -> Expr:
1880
- """
1881
- Split the string by a substring, restricted to returning at most `n` items.
1882
-
1883
- If the number of possible splits is less than `n-1`, the remaining field
1884
- elements will be null. If the number of possible splits is `n-1` or greater,
1885
- the last (nth) substring will contain the remainder of the string.
1886
-
1887
- Parameters
1888
- ----------
1889
- by
1890
- Substring to split by.
1891
- n
1892
- Max number of items to return.
1893
-
1894
- Returns
1895
- -------
1896
- Expr
1897
- Expression of data type :class:`Struct` with fields of data type
1898
- :class:`String`.
1899
-
1900
- Examples
1901
- --------
1902
- >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
1903
- >>> df.with_columns(pl.col("s").str.splitn(" ", 2).alias("fields"))
1904
- shape: (4, 2)
1905
- ┌─────────────┬───────────────────┐
1906
- │ s ┆ fields │
1907
- │ --- ┆ --- │
1908
- │ str ┆ struct[2] │
1909
- ╞═════════════╪═══════════════════╡
1910
- │ foo bar ┆ {"foo","bar"} │
1911
- │ null ┆ {null,null} │
1912
- │ foo-bar ┆ {"foo-bar",null} │
1913
- │ foo bar baz ┆ {"foo","bar baz"} │
1914
- └─────────────┴───────────────────┘
1915
-
1916
- Split string values in column s in exactly 2 parts and assign
1917
- each part to a new column.
1918
-
1919
- >>> df.with_columns(
1920
- ... pl.col("s")
1921
- ... .str.splitn(" ", 2)
1922
- ... .struct.rename_fields(["first_part", "second_part"])
1923
- ... .alias("fields")
1924
- ... ).unnest("fields")
1925
- shape: (4, 3)
1926
- ┌─────────────┬────────────┬─────────────┐
1927
- │ s ┆ first_part ┆ second_part │
1928
- │ --- ┆ --- ┆ --- │
1929
- │ str ┆ str ┆ str │
1930
- ╞═════════════╪════════════╪═════════════╡
1931
- │ foo bar ┆ foo ┆ bar │
1932
- │ null ┆ null ┆ null │
1933
- │ foo-bar ┆ foo-bar ┆ null │
1934
- │ foo bar baz ┆ foo ┆ bar baz │
1935
- └─────────────┴────────────┴─────────────┘
1936
- """
1937
- by_pyexpr = parse_into_expression(by, str_as_lit=True)
1938
- return wrap_expr(self._pyexpr.str_splitn(by_pyexpr, n))
1939
-
1940
- def replace(
1941
- self,
1942
- pattern: str | Expr,
1943
- value: str | Expr,
1944
- *,
1945
- literal: bool = False,
1946
- n: int = 1,
1947
- ) -> Expr:
1948
- r"""
1949
- Replace first matching regex/literal substring with a new string value.
1950
-
1951
- Parameters
1952
- ----------
1953
- pattern
1954
- A valid regular expression pattern, compatible with the `regex crate
1955
- <https://docs.rs/regex/latest/regex/>`_.
1956
- value
1957
- String that will replace the matched substring.
1958
- literal
1959
- Treat `pattern` as a literal string.
1960
- n
1961
- Number of matches to replace.
1962
-
1963
- See Also
1964
- --------
1965
- replace_all
1966
-
1967
- Notes
1968
- -----
1969
- * To modify regular expression behaviour (such as case-sensitivity) with flags,
1970
- use the inline `(?iLmsuxU)` syntax. See the regex crate's section on
1971
- `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_
1972
- for additional information about the use of inline expression modifiers.
1973
-
1974
- * The dollar sign (`$`) is a special character related to capture groups; if you
1975
- want to replace some target pattern with characters that include a literal `$`
1976
- you should escape it by doubling it up as `$$`, or set `literal=True` if you
1977
- do not need a full regular expression pattern match. Otherwise, you will be
1978
- referencing a (potentially non-existent) capture group.
1979
-
1980
- In the example below we need to double up `$` (to represent a literal dollar
1981
- sign, and then refer to the capture group using `$n` or `${n}`, hence the
1982
- three consecutive `$` characters in the replacement value:
1983
-
1984
- .. code-block:: python
1985
-
1986
- >>> df = pl.DataFrame({"cost": ["#12.34", "#56.78"]})
1987
- >>> df.with_columns(
1988
- ... cost_usd=pl.col("cost").str.replace(r"#(\d+)", "$$${1}")
1989
- ... )
1990
- shape: (2, 2)
1991
- ┌────────┬──────────┐
1992
- │ cost ┆ cost_usd │
1993
- │ --- ┆ --- │
1994
- │ str ┆ str │
1995
- ╞════════╪══════════╡
1996
- │ #12.34 ┆ $12.34 │
1997
- │ #56.78 ┆ $56.78 │
1998
- └────────┴──────────┘
1999
-
2000
- Examples
2001
- --------
2002
- >>> df = pl.DataFrame({"id": [1, 2], "text": ["123abc", "abc456"]})
2003
- >>> df.with_columns(pl.col("text").str.replace(r"abc\b", "ABC"))
2004
- shape: (2, 2)
2005
- ┌─────┬────────┐
2006
- │ id ┆ text │
2007
- │ --- ┆ --- │
2008
- │ i64 ┆ str │
2009
- ╞═════╪════════╡
2010
- │ 1 ┆ 123ABC │
2011
- │ 2 ┆ abc456 │
2012
- └─────┴────────┘
2013
-
2014
- Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer
2015
- to the first capture group in the `pattern`, `$2` or `${2}` to refer to the
2016
- second capture group, and so on. You can also use *named* capture groups.
2017
-
2018
- >>> df = pl.DataFrame({"word": ["hat", "hut"]})
2019
- >>> df.with_columns(
2020
- ... positional=pl.col.word.str.replace("h(.)t", "b${1}d"),
2021
- ... named=pl.col.word.str.replace("h(?<vowel>.)t", "b${vowel}d"),
2022
- ... )
2023
- shape: (2, 3)
2024
- ┌──────┬────────────┬───────┐
2025
- │ word ┆ positional ┆ named │
2026
- │ --- ┆ --- ┆ --- │
2027
- │ str ┆ str ┆ str │
2028
- ╞══════╪════════════╪═══════╡
2029
- │ hat ┆ bad ┆ bad │
2030
- │ hut ┆ bud ┆ bud │
2031
- └──────┴────────────┴───────┘
2032
-
2033
- Apply case-insensitive string replacement using the `(?i)` flag.
2034
-
2035
- >>> df = pl.DataFrame(
2036
- ... {
2037
- ... "city": "Philadelphia",
2038
- ... "season": ["Spring", "Summer", "Autumn", "Winter"],
2039
- ... "weather": ["Rainy", "Sunny", "Cloudy", "Snowy"],
2040
- ... }
2041
- ... )
2042
- >>> df.with_columns(
2043
- ... pl.col("weather").str.replace(r"(?i)foggy|rainy|cloudy|snowy", "Sunny")
2044
- ... )
2045
- shape: (4, 3)
2046
- ┌──────────────┬────────┬─────────┐
2047
- │ city ┆ season ┆ weather │
2048
- │ --- ┆ --- ┆ --- │
2049
- │ str ┆ str ┆ str │
2050
- ╞══════════════╪════════╪═════════╡
2051
- │ Philadelphia ┆ Spring ┆ Sunny │
2052
- │ Philadelphia ┆ Summer ┆ Sunny │
2053
- │ Philadelphia ┆ Autumn ┆ Sunny │
2054
- │ Philadelphia ┆ Winter ┆ Sunny │
2055
- └──────────────┴────────┴─────────┘
2056
- """
2057
- pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
2058
- value_pyexpr = parse_into_expression(value, str_as_lit=True)
2059
- return wrap_expr(
2060
- self._pyexpr.str_replace_n(pattern_pyexpr, value_pyexpr, literal, n)
2061
- )
2062
-
2063
- def replace_all(
2064
- self, pattern: str | Expr, value: str | Expr, *, literal: bool = False
2065
- ) -> Expr:
2066
- r"""
2067
- Replace all matching regex/literal substrings with a new string value.
2068
-
2069
- Parameters
2070
- ----------
2071
- pattern
2072
- A valid regular expression pattern, compatible with the `regex crate
2073
- <https://docs.rs/regex/latest/regex/>`_.
2074
- value
2075
- String that will replace the matched substring.
2076
- literal
2077
- Treat `pattern` as a literal string.
2078
-
2079
- See Also
2080
- --------
2081
- replace
2082
-
2083
- Notes
2084
- -----
2085
- * To modify regular expression behaviour (such as case-sensitivity) with flags,
2086
- use the inline `(?iLmsuxU)` syntax. See the regex crate's section on
2087
- `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_
2088
- for additional information about the use of inline expression modifiers.
2089
-
2090
- * The dollar sign (`$`) is a special character related to capture groups; if you
2091
- want to replace some target pattern with characters that include a literal `$`
2092
- you should escape it by doubling it up as `$$`, or set `literal=True` if you
2093
- do not need a full regular expression pattern match. Otherwise, you will be
2094
- referencing a (potentially non-existent) capture group.
2095
-
2096
- In the example below we need to double up `$` to represent a literal dollar
2097
- sign, otherwise we are referring to a capture group (which may or may not
2098
- exist):
2099
-
2100
- .. code-block:: python
2101
-
2102
- >>> df = pl.DataFrame({"text": ["ab12cd34ef", "gh45ij67kl"]})
2103
- >>> df.with_columns(
2104
- ... # the replacement pattern refers back to the capture group
2105
- ... text1=pl.col("text").str.replace_all(r"(?<N>\d{2,})", "$N$"),
2106
- ... # doubling-up the `$` results in it appearing as a literal value
2107
- ... text2=pl.col("text").str.replace_all(r"(?<N>\d{2,})", "$$N$$"),
2108
- ... )
2109
- shape: (2, 3)
2110
- ┌────────────┬──────────────┬──────────────┐
2111
- │ text ┆ text1 ┆ text2 │
2112
- │ --- ┆ --- ┆ --- │
2113
- │ str ┆ str ┆ str │
2114
- ╞════════════╪══════════════╪══════════════╡
2115
- │ ab12cd34ef ┆ ab12$cd34$ef ┆ ab$N$cd$N$ef │
2116
- │ gh45ij67kl ┆ gh45$ij67$kl ┆ gh$N$ij$N$kl │
2117
- └────────────┴──────────────┴──────────────┘
2118
-
2119
- Examples
2120
- --------
2121
- >>> df = pl.DataFrame({"id": [1, 2], "text": ["abcabc", "123a123"]})
2122
- >>> df.with_columns(pl.col("text").str.replace_all("a", "-"))
2123
- shape: (2, 2)
2124
- ┌─────┬─────────┐
2125
- │ id ┆ text │
2126
- │ --- ┆ --- │
2127
- │ i64 ┆ str │
2128
- ╞═════╪═════════╡
2129
- │ 1 ┆ -bc-bc │
2130
- │ 2 ┆ 123-123 │
2131
- └─────┴─────────┘
2132
-
2133
- Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer
2134
- to the first capture group in the `pattern`, `$2` or `${2}` to refer to the
2135
- second capture group, and so on. You can also use *named* capture groups.
2136
-
2137
- >>> df = pl.DataFrame({"word": ["hat", "hut"]})
2138
- >>> df.with_columns(
2139
- ... positional=pl.col.word.str.replace_all("h(.)t", "b${1}d"),
2140
- ... named=pl.col.word.str.replace_all("h(?<vowel>.)t", "b${vowel}d"),
2141
- ... )
2142
- shape: (2, 3)
2143
- ┌──────┬────────────┬───────┐
2144
- │ word ┆ positional ┆ named │
2145
- │ --- ┆ --- ┆ --- │
2146
- │ str ┆ str ┆ str │
2147
- ╞══════╪════════════╪═══════╡
2148
- │ hat ┆ bad ┆ bad │
2149
- │ hut ┆ bud ┆ bud │
2150
- └──────┴────────────┴───────┘
2151
-
2152
- Apply case-insensitive string replacement using the `(?i)` flag.
2153
-
2154
- >>> df = pl.DataFrame(
2155
- ... {
2156
- ... "city": "Philadelphia",
2157
- ... "season": ["Spring", "Summer", "Autumn", "Winter"],
2158
- ... "weather": ["Rainy", "Sunny", "Cloudy", "Snowy"],
2159
- ... }
2160
- ... )
2161
- >>> df.with_columns(
2162
- ... # apply case-insensitive string replacement
2163
- ... pl.col("weather").str.replace_all(
2164
- ... r"(?i)foggy|rainy|cloudy|snowy", "Sunny"
2165
- ... )
2166
- ... )
2167
- shape: (4, 3)
2168
- ┌──────────────┬────────┬─────────┐
2169
- │ city ┆ season ┆ weather │
2170
- │ --- ┆ --- ┆ --- │
2171
- │ str ┆ str ┆ str │
2172
- ╞══════════════╪════════╪═════════╡
2173
- │ Philadelphia ┆ Spring ┆ Sunny │
2174
- │ Philadelphia ┆ Summer ┆ Sunny │
2175
- │ Philadelphia ┆ Autumn ┆ Sunny │
2176
- │ Philadelphia ┆ Winter ┆ Sunny │
2177
- └──────────────┴────────┴─────────┘
2178
- """
2179
- pattern_pyexpr = parse_into_expression(pattern, str_as_lit=True)
2180
- value_pyexpr = parse_into_expression(value, str_as_lit=True)
2181
- return wrap_expr(
2182
- self._pyexpr.str_replace_all(pattern_pyexpr, value_pyexpr, literal)
2183
- )
2184
-
2185
- def reverse(self) -> Expr:
2186
- """
2187
- Returns string values in reversed order.
2188
-
2189
- Examples
2190
- --------
2191
- >>> df = pl.DataFrame({"text": ["foo", "bar", "man\u0303ana"]})
2192
- >>> df.with_columns(pl.col("text").str.reverse().alias("reversed"))
2193
- shape: (3, 2)
2194
- ┌────────┬──────────┐
2195
- │ text ┆ reversed │
2196
- │ --- ┆ --- │
2197
- │ str ┆ str │
2198
- ╞════════╪══════════╡
2199
- │ foo ┆ oof │
2200
- │ bar ┆ rab │
2201
- │ mañana ┆ anañam │
2202
- └────────┴──────────┘
2203
- """
2204
- return wrap_expr(self._pyexpr.str_reverse())
2205
-
2206
- def slice(
2207
- self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
2208
- ) -> Expr:
2209
- """
2210
- Extract a substring from each string value.
2211
-
2212
- Parameters
2213
- ----------
2214
- offset
2215
- Start index. Negative indexing is supported.
2216
- length
2217
- Length of the slice. If set to `None` (default), the slice is taken to the
2218
- end of the string.
2219
-
2220
- Returns
2221
- -------
2222
- Expr
2223
- Expression of data type :class:`String`.
2224
-
2225
- Notes
2226
- -----
2227
- Both the `offset` and `length` inputs are defined in terms of the number
2228
- of characters in the (UTF8) string. A character is defined as a
2229
- `Unicode scalar value`_. A single character is represented by a single byte
2230
- when working with ASCII text, and a maximum of 4 bytes otherwise.
2231
-
2232
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2233
-
2234
- Examples
2235
- --------
2236
- >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
2237
- >>> df.with_columns(pl.col("s").str.slice(-3).alias("slice"))
2238
- shape: (4, 2)
2239
- ┌─────────────┬───────┐
2240
- │ s ┆ slice │
2241
- │ --- ┆ --- │
2242
- │ str ┆ str │
2243
- ╞═════════════╪═══════╡
2244
- │ pear ┆ ear │
2245
- │ null ┆ null │
2246
- │ papaya ┆ aya │
2247
- │ dragonfruit ┆ uit │
2248
- └─────────────┴───────┘
2249
-
2250
- Using the optional `length` parameter
2251
-
2252
- >>> df.with_columns(pl.col("s").str.slice(4, length=3).alias("slice"))
2253
- shape: (4, 2)
2254
- ┌─────────────┬───────┐
2255
- │ s ┆ slice │
2256
- │ --- ┆ --- │
2257
- │ str ┆ str │
2258
- ╞═════════════╪═══════╡
2259
- │ pear ┆ │
2260
- │ null ┆ null │
2261
- │ papaya ┆ ya │
2262
- │ dragonfruit ┆ onf │
2263
- └─────────────┴───────┘
2264
- """
2265
- offset_pyexpr = parse_into_expression(offset)
2266
- length_pyexpr = parse_into_expression(length)
2267
- return wrap_expr(self._pyexpr.str_slice(offset_pyexpr, length_pyexpr))
2268
-
2269
- def head(self, n: int | IntoExprColumn) -> Expr:
2270
- """
2271
- Return the first n characters of each string in a String Series.
2272
-
2273
- Parameters
2274
- ----------
2275
- n
2276
- Length of the slice (integer or expression). Negative indexing is supported;
2277
- see note (2) below.
2278
-
2279
- Returns
2280
- -------
2281
- Expr
2282
- Expression of data type :class:`String`.
2283
-
2284
- Notes
2285
- -----
2286
- 1) The `n` input is defined in terms of the number of characters in the (UTF8)
2287
- string. A character is defined as a `Unicode scalar value`_. A single
2288
- character is represented by a single byte when working with ASCII text, and a
2289
- maximum of 4 bytes otherwise.
2290
-
2291
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2292
-
2293
- 2) When the `n` input is negative, `head` returns characters up to the `n`th
2294
- from the end of the string. For example, if `n = -3`, then all characters
2295
- except the last three are returned.
2296
-
2297
- 3) If the length of the string has fewer than `n` characters, the full string is
2298
- returned.
2299
-
2300
- Examples
2301
- --------
2302
- Return up to the first 5 characters:
2303
-
2304
- >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
2305
- >>> df.with_columns(pl.col("s").str.head(5).alias("s_head_5"))
2306
- shape: (4, 2)
2307
- ┌─────────────┬──────────┐
2308
- │ s ┆ s_head_5 │
2309
- │ --- ┆ --- │
2310
- │ str ┆ str │
2311
- ╞═════════════╪══════════╡
2312
- │ pear ┆ pear │
2313
- │ null ┆ null │
2314
- │ papaya ┆ papay │
2315
- │ dragonfruit ┆ drago │
2316
- └─────────────┴──────────┘
2317
-
2318
- Return characters determined by column `n`:
2319
-
2320
- >>> df = pl.DataFrame(
2321
- ... {
2322
- ... "s": ["pear", None, "papaya", "dragonfruit"],
2323
- ... "n": [3, 4, -2, -5],
2324
- ... }
2325
- ... )
2326
- >>> df.with_columns(pl.col("s").str.head("n").alias("s_head_n"))
2327
- shape: (4, 3)
2328
- ┌─────────────┬─────┬──────────┐
2329
- │ s ┆ n ┆ s_head_n │
2330
- │ --- ┆ --- ┆ --- │
2331
- │ str ┆ i64 ┆ str │
2332
- ╞═════════════╪═════╪══════════╡
2333
- │ pear ┆ 3 ┆ pea │
2334
- │ null ┆ 4 ┆ null │
2335
- │ papaya ┆ -2 ┆ papa │
2336
- │ dragonfruit ┆ -5 ┆ dragon │
2337
- └─────────────┴─────┴──────────┘
2338
- """
2339
- n_pyexpr = parse_into_expression(n)
2340
- return wrap_expr(self._pyexpr.str_head(n_pyexpr))
2341
-
2342
- def tail(self, n: int | IntoExprColumn) -> Expr:
2343
- """
2344
- Return the last n characters of each string in a String Series.
2345
-
2346
- Parameters
2347
- ----------
2348
- n
2349
- Length of the slice (integer or expression). Negative indexing is supported;
2350
- see note (2) below.
2351
-
2352
- Returns
2353
- -------
2354
- Expr
2355
- Expression of data type :class:`String`.
2356
-
2357
- Notes
2358
- -----
2359
- 1) The `n` input is defined in terms of the number of characters in the (UTF8)
2360
- string. A character is defined as a `Unicode scalar value`_. A single
2361
- character is represented by a single byte when working with ASCII text, and a
2362
- maximum of 4 bytes otherwise.
2363
-
2364
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
2365
-
2366
- 2) When the `n` input is negative, `tail` returns characters starting from the
2367
- `n`th from the beginning of the string. For example, if `n = -3`, then all
2368
- characters except the first three are returned.
2369
-
2370
- 3) If the length of the string has fewer than `n` characters, the full string is
2371
- returned.
2372
-
2373
- Examples
2374
- --------
2375
- Return up to the last 5 characters:
2376
-
2377
- >>> df = pl.DataFrame({"s": ["pear", None, "papaya", "dragonfruit"]})
2378
- >>> df.with_columns(pl.col("s").str.tail(5).alias("s_tail_5"))
2379
- shape: (4, 2)
2380
- ┌─────────────┬──────────┐
2381
- │ s ┆ s_tail_5 │
2382
- │ --- ┆ --- │
2383
- │ str ┆ str │
2384
- ╞═════════════╪══════════╡
2385
- │ pear ┆ pear │
2386
- │ null ┆ null │
2387
- │ papaya ┆ apaya │
2388
- │ dragonfruit ┆ fruit │
2389
- └─────────────┴──────────┘
2390
-
2391
- Return characters determined by column `n`:
2392
-
2393
- >>> df = pl.DataFrame(
2394
- ... {
2395
- ... "s": ["pear", None, "papaya", "dragonfruit"],
2396
- ... "n": [3, 4, -2, -5],
2397
- ... }
2398
- ... )
2399
- >>> df.with_columns(pl.col("s").str.tail("n").alias("s_tail_n"))
2400
- shape: (4, 3)
2401
- ┌─────────────┬─────┬──────────┐
2402
- │ s ┆ n ┆ s_tail_n │
2403
- │ --- ┆ --- ┆ --- │
2404
- │ str ┆ i64 ┆ str │
2405
- ╞═════════════╪═════╪══════════╡
2406
- │ pear ┆ 3 ┆ ear │
2407
- │ null ┆ 4 ┆ null │
2408
- │ papaya ┆ -2 ┆ paya │
2409
- │ dragonfruit ┆ -5 ┆ nfruit │
2410
- └─────────────┴─────┴──────────┘
2411
- """
2412
- n_pyexpr = parse_into_expression(n)
2413
- return wrap_expr(self._pyexpr.str_tail(n_pyexpr))
2414
-
2415
- @deprecated(
2416
- '`str.explode` is deprecated; use `str.split("").explode()` instead.'
2417
- " Note that empty strings will result in null instead of being preserved."
2418
- " To get the exact same behavior, split first and then use a `pl.when...then...otherwise`"
2419
- " expression to handle the empty list before exploding."
2420
- )
2421
- def explode(self) -> Expr:
2422
- """
2423
- Returns a column with a separate row for every string character.
2424
-
2425
- .. deprecated:: 0.20.31
2426
- Use the `.str.split("").explode()` method instead. Note that empty strings
2427
- will result in null instead of being preserved. To get the exact same
2428
- behavior, split first and then use a `pl.when...then...otherwise`
2429
- expression to handle the empty list before exploding.
2430
-
2431
- Returns
2432
- -------
2433
- Expr
2434
- Expression of data type :class:`String`.
2435
-
2436
- Examples
2437
- --------
2438
- >>> df = pl.DataFrame({"a": ["foo", "bar"]})
2439
- >>> df.select(pl.col("a").str.explode()) # doctest: +SKIP
2440
- shape: (6, 1)
2441
- ┌─────┐
2442
- │ a │
2443
- │ --- │
2444
- │ str │
2445
- ╞═════╡
2446
- │ f │
2447
- │ o │
2448
- │ o │
2449
- │ b │
2450
- │ a │
2451
- │ r │
2452
- └─────┘
2453
- """
2454
- split = self.split("")
2455
- return F.when(split.ne_missing([])).then(split).otherwise([""]).explode()
2456
-
2457
- def to_integer(
2458
- self,
2459
- *,
2460
- base: int | IntoExprColumn = 10,
2461
- dtype: PolarsIntegerType = Int64,
2462
- strict: bool = True,
2463
- ) -> Expr:
2464
- """
2465
- Convert a String column into an Int64 column with base radix.
2466
-
2467
- Parameters
2468
- ----------
2469
- base
2470
- Positive integer or expression which is the base of the string
2471
- we are parsing.
2472
- Default: 10.
2473
- strict
2474
- Bool, Default=True will raise any ParseError or overflow as ComputeError.
2475
- False silently convert to Null.
2476
-
2477
- Returns
2478
- -------
2479
- Expr
2480
- Expression of data type :class:`Int64`.
2481
-
2482
- Examples
2483
- --------
2484
- >>> df = pl.DataFrame({"bin": ["110", "101", "010", "invalid"]})
2485
- >>> df.with_columns(
2486
- ... parsed=pl.col("bin").str.to_integer(
2487
- ... base=2, dtype=pl.Int32, strict=False
2488
- ... )
2489
- ... )
2490
- shape: (4, 2)
2491
- ┌─────────┬────────┐
2492
- │ bin ┆ parsed │
2493
- │ --- ┆ --- │
2494
- │ str ┆ i32 │
2495
- ╞═════════╪════════╡
2496
- │ 110 ┆ 6 │
2497
- │ 101 ┆ 5 │
2498
- │ 010 ┆ 2 │
2499
- │ invalid ┆ null │
2500
- └─────────┴────────┘
2501
-
2502
- >>> df = pl.DataFrame({"hex": ["fa1e", "ff00", "cafe", None]})
2503
- >>> df.with_columns(parsed=pl.col("hex").str.to_integer(base=16, strict=True))
2504
- shape: (4, 2)
2505
- ┌──────┬────────┐
2506
- │ hex ┆ parsed │
2507
- │ --- ┆ --- │
2508
- │ str ┆ i64 │
2509
- ╞══════╪════════╡
2510
- │ fa1e ┆ 64030 │
2511
- │ ff00 ┆ 65280 │
2512
- │ cafe ┆ 51966 │
2513
- │ null ┆ null │
2514
- └──────┴────────┘
2515
- """
2516
- base_pyexpr = parse_into_expression(base, str_as_lit=False)
2517
- return wrap_expr(self._pyexpr.str_to_integer(base_pyexpr, dtype, strict))
2518
-
2519
- def contains_any(
2520
- self, patterns: IntoExpr, *, ascii_case_insensitive: bool = False
2521
- ) -> Expr:
2522
- """
2523
- Use the Aho-Corasick algorithm to find matches.
2524
-
2525
- Determines if any of the patterns are contained in the string.
2526
-
2527
- Parameters
2528
- ----------
2529
- patterns
2530
- String patterns to search.
2531
- ascii_case_insensitive
2532
- Enable ASCII-aware case-insensitive matching.
2533
- When this option is enabled, searching will be performed without respect
2534
- to case for ASCII letters (a-z and A-Z) only.
2535
-
2536
- Notes
2537
- -----
2538
- This method supports matching on string literals only, and does not support
2539
- regular expression matching.
2540
-
2541
- Examples
2542
- --------
2543
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2544
- >>> df = pl.DataFrame(
2545
- ... {
2546
- ... "lyrics": [
2547
- ... "Everybody wants to rule the world",
2548
- ... "Tell me what you want, what you really really want",
2549
- ... "Can you feel the love tonight",
2550
- ... ]
2551
- ... }
2552
- ... )
2553
- >>> df.with_columns(
2554
- ... pl.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
2555
- ... )
2556
- shape: (3, 2)
2557
- ┌────────────────────────────────────────────────────┬──────────────┐
2558
- │ lyrics ┆ contains_any │
2559
- │ --- ┆ --- │
2560
- │ str ┆ bool │
2561
- ╞════════════════════════════════════════════════════╪══════════════╡
2562
- │ Everybody wants to rule the world ┆ false │
2563
- │ Tell me what you want, what you really really want ┆ true │
2564
- │ Can you feel the love tonight ┆ true │
2565
- └────────────────────────────────────────────────────┴──────────────┘
2566
- """
2567
- patterns_pyexpr = parse_into_expression(patterns, str_as_lit=False)
2568
- return wrap_expr(
2569
- self._pyexpr.str_contains_any(patterns_pyexpr, ascii_case_insensitive)
2570
- )
2571
-
2572
- def replace_many(
2573
- self,
2574
- patterns: IntoExpr | Mapping[str, str],
2575
- replace_with: IntoExpr | NoDefault = no_default,
2576
- *,
2577
- ascii_case_insensitive: bool = False,
2578
- ) -> Expr:
2579
- """
2580
- Use the Aho-Corasick algorithm to replace many matches.
2581
-
2582
- Parameters
2583
- ----------
2584
- patterns
2585
- String patterns to search and replace.
2586
- Accepts expression input. Strings are parsed as column names, and other
2587
- non-expression inputs are parsed as literals. Also accepts a mapping of
2588
- patterns to their replacement as syntactic sugar for
2589
- `replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`.
2590
- replace_with
2591
- Strings to replace where a pattern was a match.
2592
- Accepts expression input. Non-expression inputs are parsed as literals.
2593
- Length must match the length of `patterns` or have length 1. This can be
2594
- broadcasted, so it supports many:one and many:many.
2595
- ascii_case_insensitive
2596
- Enable ASCII-aware case-insensitive matching.
2597
- When this option is enabled, searching will be performed without respect
2598
- to case for ASCII letters (a-z and A-Z) only.
2599
-
2600
- Notes
2601
- -----
2602
- This method supports matching on string literals only, and does not support
2603
- regular expression matching.
2604
-
2605
- Examples
2606
- --------
2607
- Replace many patterns by passing sequences of equal length to the `patterns` and
2608
- `replace_with` parameters.
2609
-
2610
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2611
- >>> _ = pl.Config.set_tbl_width_chars(110)
2612
- >>> df = pl.DataFrame(
2613
- ... {
2614
- ... "lyrics": [
2615
- ... "Everybody wants to rule the world",
2616
- ... "Tell me what you want, what you really really want",
2617
- ... "Can you feel the love tonight",
2618
- ... ]
2619
- ... }
2620
- ... )
2621
- >>> df.with_columns(
2622
- ... pl.col("lyrics")
2623
- ... .str.replace_many(
2624
- ... ["me", "you"],
2625
- ... ["you", "me"],
2626
- ... )
2627
- ... .alias("confusing")
2628
- ... )
2629
- shape: (3, 2)
2630
- ┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐
2631
- │ lyrics ┆ confusing │
2632
- │ --- ┆ --- │
2633
- │ str ┆ str │
2634
- ╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
2635
- │ Everybody wants to rule the world ┆ Everybody wants to rule the world │
2636
- │ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want │
2637
- │ Can you feel the love tonight ┆ Can me feel the love tonight │
2638
- └────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘
2639
-
2640
- Broadcast a replacement for many patterns by passing sequence of length 1 to the
2641
- `replace_with` parameter.
2642
-
2643
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2644
- >>> df = pl.DataFrame(
2645
- ... {
2646
- ... "lyrics": [
2647
- ... "Everybody wants to rule the world",
2648
- ... "Tell me what you want, what you really really want",
2649
- ... "Can you feel the love tonight",
2650
- ... ]
2651
- ... }
2652
- ... )
2653
- >>> df.with_columns(
2654
- ... pl.col("lyrics")
2655
- ... .str.replace_many(
2656
- ... ["me", "you", "they"],
2657
- ... [""],
2658
- ... )
2659
- ... .alias("removes_pronouns")
2660
- ... )
2661
- shape: (3, 2)
2662
- ┌────────────────────────────────────────────────────┬────────────────────────────────────────────┐
2663
- │ lyrics ┆ removes_pronouns │
2664
- │ --- ┆ --- │
2665
- │ str ┆ str │
2666
- ╞════════════════════════════════════════════════════╪════════════════════════════════════════════╡
2667
- │ Everybody wants to rule the world ┆ Everybody wants to rule the world │
2668
- │ Tell me what you want, what you really really want ┆ Tell what want, what really really want │
2669
- │ Can you feel the love tonight ┆ Can feel the love tonight │
2670
- └────────────────────────────────────────────────────┴────────────────────────────────────────────┘
2671
-
2672
- Passing a mapping with patterns and replacements is also supported as syntactic
2673
- sugar.
2674
-
2675
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2676
- >>> _ = pl.Config.set_tbl_width_chars(110)
2677
- >>> df = pl.DataFrame(
2678
- ... {
2679
- ... "lyrics": [
2680
- ... "Everybody wants to rule the world",
2681
- ... "Tell me what you want, what you really really want",
2682
- ... "Can you feel the love tonight",
2683
- ... ]
2684
- ... }
2685
- ... )
2686
- >>> mapping = {"me": "you", "you": "me", "want": "need"}
2687
- >>> df.with_columns(
2688
- ... pl.col("lyrics").str.replace_many(mapping).alias("confusing")
2689
- ... )
2690
- shape: (3, 2)
2691
- ┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐
2692
- │ lyrics ┆ confusing │
2693
- │ --- ┆ --- │
2694
- │ str ┆ str │
2695
- ╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
2696
- │ Everybody wants to rule the world ┆ Everybody needs to rule the world │
2697
- │ Tell me what you want, what you really really want ┆ Tell you what me need, what me really really need │
2698
- │ Can you feel the love tonight ┆ Can me feel the love tonight │
2699
- └────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘
2700
- """ # noqa: W505
2701
- if replace_with is no_default:
2702
- if not isinstance(patterns, Mapping):
2703
- msg = "`replace_with` argument is required if `patterns` argument is not a Mapping type"
2704
- raise TypeError(msg)
2705
- # Early return in case of an empty mapping.
2706
- if not patterns:
2707
- return wrap_expr(self._pyexpr)
2708
- replace_with = list(patterns.values())
2709
- patterns = list(patterns.keys())
2710
-
2711
- patterns_pyexpr = parse_into_expression(
2712
- patterns, # type: ignore[arg-type]
2713
- str_as_lit=False,
2714
- )
2715
- replace_with_pyexpr = parse_into_expression(replace_with, str_as_lit=True)
2716
- return wrap_expr(
2717
- self._pyexpr.str_replace_many(
2718
- patterns_pyexpr, replace_with_pyexpr, ascii_case_insensitive
2719
- )
2720
- )
2721
-
2722
- @unstable()
2723
- def extract_many(
2724
- self,
2725
- patterns: IntoExpr,
2726
- *,
2727
- ascii_case_insensitive: bool = False,
2728
- overlapping: bool = False,
2729
- ) -> Expr:
2730
- """
2731
- Use the Aho-Corasick algorithm to extract many matches.
2732
-
2733
- Parameters
2734
- ----------
2735
- patterns
2736
- String patterns to search.
2737
- ascii_case_insensitive
2738
- Enable ASCII-aware case-insensitive matching.
2739
- When this option is enabled, searching will be performed without respect
2740
- to case for ASCII letters (a-z and A-Z) only.
2741
- overlapping
2742
- Whether matches may overlap.
2743
-
2744
- Notes
2745
- -----
2746
- This method supports matching on string literals only, and does not support
2747
- regular expression matching.
2748
-
2749
- Examples
2750
- --------
2751
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2752
- >>> df = pl.DataFrame({"values": ["discontent"]})
2753
- >>> patterns = ["winter", "disco", "onte", "discontent"]
2754
- >>> df.with_columns(
2755
- ... pl.col("values")
2756
- ... .str.extract_many(patterns, overlapping=False)
2757
- ... .alias("matches"),
2758
- ... pl.col("values")
2759
- ... .str.extract_many(patterns, overlapping=True)
2760
- ... .alias("matches_overlapping"),
2761
- ... )
2762
- shape: (1, 3)
2763
- ┌────────────┬───────────┬─────────────────────────────────┐
2764
- │ values ┆ matches ┆ matches_overlapping │
2765
- │ --- ┆ --- ┆ --- │
2766
- │ str ┆ list[str] ┆ list[str] │
2767
- ╞════════════╪═══════════╪═════════════════════════════════╡
2768
- │ discontent ┆ ["disco"] ┆ ["disco", "onte", "discontent"] │
2769
- └────────────┴───────────┴─────────────────────────────────┘
2770
- >>> df = pl.DataFrame(
2771
- ... {
2772
- ... "values": ["discontent", "rhapsody"],
2773
- ... "patterns": [
2774
- ... ["winter", "disco", "onte", "discontent"],
2775
- ... ["rhap", "ody", "coalesce"],
2776
- ... ],
2777
- ... }
2778
- ... )
2779
- >>> df.select(pl.col("values").str.extract_many("patterns"))
2780
- shape: (2, 1)
2781
- ┌─────────────────┐
2782
- │ values │
2783
- │ --- │
2784
- │ list[str] │
2785
- ╞═════════════════╡
2786
- │ ["disco"] │
2787
- │ ["rhap", "ody"] │
2788
- └─────────────────┘
2789
- """
2790
- patterns_pyexpr = parse_into_expression(patterns, str_as_lit=False)
2791
- return wrap_expr(
2792
- self._pyexpr.str_extract_many(
2793
- patterns_pyexpr, ascii_case_insensitive, overlapping
2794
- )
2795
- )
2796
-
2797
- @unstable()
2798
- def find_many(
2799
- self,
2800
- patterns: IntoExpr,
2801
- *,
2802
- ascii_case_insensitive: bool = False,
2803
- overlapping: bool = False,
2804
- ) -> Expr:
2805
- """
2806
- Use the Aho-Corasick algorithm to find many matches.
2807
-
2808
- The function will return the bytes offset of the start of each match.
2809
- The return type will be `List<UInt32>`
2810
-
2811
- Parameters
2812
- ----------
2813
- patterns
2814
- String patterns to search.
2815
- ascii_case_insensitive
2816
- Enable ASCII-aware case-insensitive matching.
2817
- When this option is enabled, searching will be performed without respect
2818
- to case for ASCII letters (a-z and A-Z) only.
2819
- overlapping
2820
- Whether matches may overlap.
2821
-
2822
- Notes
2823
- -----
2824
- This method supports matching on string literals only, and does not support
2825
- regular expression matching.
2826
-
2827
- Examples
2828
- --------
2829
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2830
- >>> df = pl.DataFrame({"values": ["discontent"]})
2831
- >>> patterns = ["winter", "disco", "onte", "discontent"]
2832
- >>> df.with_columns(
2833
- ... pl.col("values")
2834
- ... .str.find_many(patterns, overlapping=False)
2835
- ... .alias("matches"),
2836
- ... pl.col("values")
2837
- ... .str.find_many(patterns, overlapping=True)
2838
- ... .alias("matches_overlapping"),
2839
- ... )
2840
- shape: (1, 3)
2841
- ┌────────────┬───────────┬─────────────────────┐
2842
- │ values ┆ matches ┆ matches_overlapping │
2843
- │ --- ┆ --- ┆ --- │
2844
- │ str ┆ list[u32] ┆ list[u32] │
2845
- ╞════════════╪═══════════╪═════════════════════╡
2846
- │ discontent ┆ [0] ┆ [0, 4, 0] │
2847
- └────────────┴───────────┴─────────────────────┘
2848
- >>> df = pl.DataFrame(
2849
- ... {
2850
- ... "values": ["discontent", "rhapsody"],
2851
- ... "patterns": [
2852
- ... ["winter", "disco", "onte", "discontent"],
2853
- ... ["rhap", "ody", "coalesce"],
2854
- ... ],
2855
- ... }
2856
- ... )
2857
- >>> df.select(pl.col("values").str.find_many("patterns"))
2858
- shape: (2, 1)
2859
- ┌───────────┐
2860
- │ values │
2861
- │ --- │
2862
- │ list[u32] │
2863
- ╞═══════════╡
2864
- │ [0] │
2865
- │ [0, 5] │
2866
- └───────────┘
2867
- """
2868
- patterns_pyexpr = parse_into_expression(patterns, str_as_lit=False)
2869
- return wrap_expr(
2870
- self._pyexpr.str_find_many(
2871
- patterns_pyexpr, ascii_case_insensitive, overlapping
2872
- )
2873
- )
2874
-
2875
- def join(self, delimiter: str = "", *, ignore_nulls: bool = True) -> Expr:
2876
- """
2877
- Vertically concatenate the string values in the column to a single string value.
2878
-
2879
- Parameters
2880
- ----------
2881
- delimiter
2882
- The delimiter to insert between consecutive string values.
2883
- ignore_nulls
2884
- Ignore null values (default).
2885
- If set to `False`, null values will be propagated. This means that
2886
- if the column contains any null values, the output is null.
2887
-
2888
- Returns
2889
- -------
2890
- Expr
2891
- Expression of data type :class:`String`.
2892
-
2893
- Examples
2894
- --------
2895
- >>> df = pl.DataFrame({"foo": [1, None, 3]})
2896
- >>> df.select(pl.col("foo").str.join("-"))
2897
- shape: (1, 1)
2898
- ┌─────┐
2899
- │ foo │
2900
- │ --- │
2901
- │ str │
2902
- ╞═════╡
2903
- │ 1-3 │
2904
- └─────┘
2905
- >>> df.select(pl.col("foo").str.join(ignore_nulls=False))
2906
- shape: (1, 1)
2907
- ┌──────┐
2908
- │ foo │
2909
- │ --- │
2910
- │ str │
2911
- ╞══════╡
2912
- │ null │
2913
- └──────┘
2914
- """
2915
- return wrap_expr(self._pyexpr.str_join(delimiter, ignore_nulls=ignore_nulls))
2916
-
2917
- @deprecated(
2918
- "`str.concat` is deprecated; use `str.join` instead. Note also that the "
2919
- "default `delimiter` for `str.join` is an empty string, not a hyphen."
2920
- )
2921
- def concat(
2922
- self, delimiter: str | None = None, *, ignore_nulls: bool = True
2923
- ) -> Expr:
2924
- """
2925
- Vertically concatenate the string values in the column to a single string value.
2926
-
2927
- .. deprecated:: 1.0.0
2928
- Use :meth:`join` instead. Note that the default `delimiter` for :meth:`join`
2929
- is an empty string instead of a hyphen.
2930
-
2931
- Parameters
2932
- ----------
2933
- delimiter
2934
- The delimiter to insert between consecutive string values.
2935
- ignore_nulls
2936
- Ignore null values (default).
2937
- If set to `False`, null values will be propagated. This means that
2938
- if the column contains any null values, the output is null.
2939
-
2940
- Returns
2941
- -------
2942
- Expr
2943
- Expression of data type :class:`String`.
2944
-
2945
- Examples
2946
- --------
2947
- >>> df = pl.DataFrame({"foo": [1, None, 2]})
2948
- >>> df.select(pl.col("foo").str.concat("-")) # doctest: +SKIP
2949
- shape: (1, 1)
2950
- ┌─────┐
2951
- │ foo │
2952
- │ --- │
2953
- │ str │
2954
- ╞═════╡
2955
- │ 1-2 │
2956
- └─────┘
2957
- >>> df.select(
2958
- ... pl.col("foo").str.concat("-", ignore_nulls=False)
2959
- ... ) # doctest: +SKIP
2960
- shape: (1, 1)
2961
- ┌──────┐
2962
- │ foo │
2963
- │ --- │
2964
- │ str │
2965
- ╞══════╡
2966
- │ null │
2967
- └──────┘
2968
- """
2969
- if delimiter is None:
2970
- delimiter = "-"
2971
- return self.join(delimiter, ignore_nulls=ignore_nulls)
2972
-
2973
- def escape_regex(self) -> Expr:
2974
- r"""
2975
- Returns string values with all regular expression meta characters escaped.
2976
-
2977
- Examples
2978
- --------
2979
- >>> df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
2980
- >>> df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))
2981
- shape: (4, 2)
2982
- ┌──────────┬──────────────┐
2983
- │ text ┆ escaped │
2984
- │ --- ┆ --- │
2985
- │ str ┆ str │
2986
- ╞══════════╪══════════════╡
2987
- │ abc ┆ abc │
2988
- │ def ┆ def │
2989
- │ null ┆ null │
2990
- │ abc(\w+) ┆ abc\(\\w\+\) │
2991
- └──────────┴──────────────┘
2992
- """
2993
- return wrap_expr(self._pyexpr.str_escape_regex())
2994
-
2995
- def normalize(self, form: UnicodeForm = "NFC") -> Expr:
2996
- """
2997
- Returns the Unicode normal form of the string values.
2998
-
2999
- This uses the forms described in Unicode Standard Annex 15: <https://www.unicode.org/reports/tr15/>.
3000
-
3001
- Parameters
3002
- ----------
3003
- form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
3004
- Unicode form to use.
3005
-
3006
- Examples
3007
- --------
3008
- >>> df = pl.DataFrame({"text": ["01²", "KADOKAWA"]})
3009
- >>> new = df.with_columns(
3010
- ... nfc=pl.col("text").str.normalize("NFC"),
3011
- ... nfkc=pl.col("text").str.normalize("NFKC"),
3012
- ... )
3013
- >>> new
3014
- shape: (2, 3)
3015
- ┌──────────────────┬──────────────────┬──────────┐
3016
- │ text ┆ nfc ┆ nfkc │
3017
- │ --- ┆ --- ┆ --- │
3018
- │ str ┆ str ┆ str │
3019
- ╞══════════════════╪══════════════════╪══════════╡
3020
- │ 01² ┆ 01² ┆ 012 │
3021
- │ KADOKAWA ┆ KADOKAWA ┆ KADOKAWA │
3022
- └──────────────────┴──────────────────┴──────────┘
3023
- >>> new.select(pl.all().str.len_bytes())
3024
- shape: (2, 3)
3025
- ┌──────┬─────┬──────┐
3026
- │ text ┆ nfc ┆ nfkc │
3027
- │ --- ┆ --- ┆ --- │
3028
- │ u32 ┆ u32 ┆ u32 │
3029
- ╞══════╪═════╪══════╡
3030
- │ 4 ┆ 4 ┆ 3 │
3031
- │ 24 ┆ 24 ┆ 8 │
3032
- └──────┴─────┴──────┘
3033
- """ # noqa: RUF002
3034
- return wrap_expr(self._pyexpr.str_normalize(form))
3035
-
3036
-
3037
- def _validate_format_argument(format: str | None) -> None:
3038
- if format is not None and ".%f" in format:
3039
- message = (
3040
- "Detected the pattern `.%f` in the chrono format string."
3041
- " This pattern should not be used to parse values after a decimal point."
3042
- " Use `%.f` instead."
3043
- " See the full specification: https://docs.rs/chrono/latest/chrono/format/strftime"
3044
- )
3045
- warnings.warn(message, ChronoFormatWarning, stacklevel=find_stacklevel())