polars-runtime-compat 1.34.0b3__cp39-abi3-win_arm64.whl → 1.34.0b5__cp39-abi3-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of polars-runtime-compat might be problematic. Click here for more details.

Files changed (203) hide show
  1. _polars_runtime_compat/_polars_runtime_compat.pyd +0 -0
  2. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/METADATA +6 -2
  3. polars_runtime_compat-1.34.0b5.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  202. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/WHEEL +0 -0
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b5.dist-info}/licenses/LICENSE +0 -0
polars/series/string.py DELETED
@@ -1,2367 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import TYPE_CHECKING
4
-
5
- import polars._reexport as pl
6
- import polars.functions as F
7
- from polars._utils.deprecation import deprecate_nonkeyword_arguments, deprecated
8
- from polars._utils.unstable import unstable
9
- from polars._utils.various import no_default
10
- from polars._utils.wrap import wrap_s
11
- from polars.datatypes import Int64
12
- from polars.datatypes.classes import Datetime
13
- from polars.datatypes.constants import N_INFER_DEFAULT
14
- from polars.series.utils import expr_dispatch
15
-
16
- if TYPE_CHECKING:
17
- import sys
18
- from collections.abc import Mapping
19
-
20
- from polars import Expr, Series
21
- from polars._plr import PySeries
22
- from polars._typing import (
23
- Ambiguous,
24
- IntoExpr,
25
- IntoExprColumn,
26
- PolarsDataType,
27
- PolarsIntegerType,
28
- PolarsTemporalType,
29
- TimeUnit,
30
- TransferEncoding,
31
- UnicodeForm,
32
- )
33
- from polars._utils.various import NoDefault
34
-
35
- if sys.version_info >= (3, 13):
36
- from warnings import deprecated
37
- else:
38
- from typing_extensions import deprecated # noqa: TC004
39
-
40
-
41
- @expr_dispatch
42
- class StringNameSpace:
43
- """Series.str namespace."""
44
-
45
- _accessor = "str"
46
-
47
- def __init__(self, series: Series) -> None:
48
- self._s: PySeries = series._s
49
-
50
- def to_date(
51
- self,
52
- format: str | None = None,
53
- *,
54
- strict: bool = True,
55
- exact: bool = True,
56
- cache: bool = True,
57
- ) -> Series:
58
- """
59
- Convert a String column into a Date column.
60
-
61
- Parameters
62
- ----------
63
- format
64
- Format to use for conversion. Refer to the `chrono crate documentation
65
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
66
- for the full specification. Example: `"%Y-%m-%d"`.
67
- If set to None (default), the format is inferred from the data.
68
- strict
69
- Raise an error if any conversion fails.
70
- exact
71
- Require an exact format match. If False, allow the format to match anywhere
72
- in the target string.
73
-
74
- .. note::
75
- Using `exact=False` introduces a performance penalty - cleaning your
76
- data beforehand will almost certainly be more performant.
77
- cache
78
- Use a cache of unique, converted dates to apply the conversion.
79
-
80
- Examples
81
- --------
82
- >>> s = pl.Series(["2020/01/01", "2020/02/01", "2020/03/01"])
83
- >>> s.str.to_date()
84
- shape: (3,)
85
- Series: '' [date]
86
- [
87
- 2020-01-01
88
- 2020-02-01
89
- 2020-03-01
90
- ]
91
- """
92
-
93
- def to_datetime(
94
- self,
95
- format: str | None = None,
96
- *,
97
- time_unit: TimeUnit | None = None,
98
- time_zone: str | None = None,
99
- strict: bool = True,
100
- exact: bool = True,
101
- cache: bool = True,
102
- ambiguous: Ambiguous | pl.Series = "raise",
103
- ) -> pl.Series:
104
- """
105
- Convert a String column into a Datetime column.
106
-
107
- Parameters
108
- ----------
109
- format
110
- Format to use for conversion. Refer to the `chrono crate documentation
111
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
112
- for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
113
- If set to None (default), the format is inferred from the data.
114
- time_unit : {None, 'us', 'ns', 'ms'}
115
- Unit of time for the resulting Datetime column. If set to None (default),
116
- the time unit is inferred from the format string if given, eg:
117
- `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
118
- found, the default is `"us"`.
119
- time_zone
120
- Time zone for the resulting Datetime column. Rules are:
121
-
122
- - If inputs are tz-naive and `time_zone` is None, the result time zone is
123
- `None`.
124
- - If inputs are offset-aware and `time_zone` is None, inputs are converted
125
- to `'UTC'` and the result time zone is `'UTC'`.
126
- - If inputs are offset-aware and `time_zone` is given, inputs are converted
127
- to `time_zone` and the result time zone is `time_zone`.
128
- - If inputs are tz-naive and `time_zone` is given, input time zones are
129
- replaced with (not converted to!) `time_zone`, and the result time zone
130
- is `time_zone`.
131
- strict
132
- Raise an error if any conversion fails.
133
- exact
134
- Require an exact format match. If False, allow the format to match anywhere
135
- in the target string.
136
-
137
- .. note::
138
- Using `exact=False` introduces a performance penalty - cleaning your
139
- data beforehand will almost certainly be more performant.
140
- cache
141
- Use a cache of unique, converted datetimes to apply the conversion.
142
- ambiguous
143
- Determine how to deal with ambiguous datetimes:
144
-
145
- - `'raise'` (default): raise
146
- - `'earliest'`: use the earliest datetime
147
- - `'latest'`: use the latest datetime
148
- - `'null'`: set to null
149
-
150
- Examples
151
- --------
152
- >>> s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
153
- >>> s.str.to_datetime("%Y-%m-%d %H:%M%#z")
154
- shape: (2,)
155
- Series: '' [datetime[μs, UTC]]
156
- [
157
- 2020-01-01 01:00:00 UTC
158
- 2020-01-01 02:00:00 UTC
159
- ]
160
- """
161
- if format is None and time_zone is None:
162
- if isinstance(ambiguous, str):
163
- ambiguous_s = pl.Series([ambiguous])
164
- else:
165
- ambiguous_s = ambiguous
166
-
167
- return wrap_s(
168
- self._s.str_to_datetime_infer(
169
- time_unit,
170
- strict,
171
- exact,
172
- ambiguous_s._s,
173
- )
174
- )
175
- else:
176
- ambiguous_expr = F.lit(ambiguous)
177
- s = wrap_s(self._s)
178
- return (
179
- s.to_frame()
180
- .select_seq(
181
- F.col(s.name).str.to_datetime(
182
- format,
183
- time_unit=time_unit,
184
- time_zone=time_zone,
185
- strict=strict,
186
- exact=exact,
187
- cache=cache,
188
- ambiguous=ambiguous_expr,
189
- )
190
- )
191
- .to_series()
192
- )
193
-
194
- def to_time(
195
- self,
196
- format: str | None = None,
197
- *,
198
- strict: bool = True,
199
- cache: bool = True,
200
- ) -> Series:
201
- """
202
- Convert a String column into a Time column.
203
-
204
- Parameters
205
- ----------
206
- format
207
- Format to use for conversion. Refer to the `chrono crate documentation
208
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
209
- for the full specification. Example: `"%H:%M:%S"`.
210
- If set to None (default), the format is inferred from the data.
211
- strict
212
- Raise an error if any conversion fails.
213
- cache
214
- Use a cache of unique, converted times to apply the conversion.
215
-
216
- Examples
217
- --------
218
- >>> s = pl.Series(["01:00", "02:00", "03:00"])
219
- >>> s.str.to_time("%H:%M")
220
- shape: (3,)
221
- Series: '' [time]
222
- [
223
- 01:00:00
224
- 02:00:00
225
- 03:00:00
226
- ]
227
- """
228
-
229
- def strptime(
230
- self,
231
- dtype: PolarsTemporalType,
232
- format: str | None = None,
233
- *,
234
- strict: bool = True,
235
- exact: bool = True,
236
- cache: bool = True,
237
- ambiguous: Ambiguous | Series = "raise",
238
- ) -> Series:
239
- """
240
- Convert a String column into a Date/Datetime/Time column.
241
-
242
- Parameters
243
- ----------
244
- dtype
245
- The data type to convert to. Can be either Date, Datetime, or Time.
246
- format
247
- Format to use for conversion. Refer to the `chrono crate documentation
248
- <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
249
- for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
250
- If set to None (default), the format is inferred from the data.
251
- strict
252
- Raise an error if any conversion fails.
253
- exact
254
- Require an exact format match. If False, allow the format to match anywhere
255
- in the target string. Conversion to the Time type is always exact.
256
-
257
- .. note::
258
- Using `exact=False` introduces a performance penalty - cleaning your
259
- data beforehand will almost certainly be more performant.
260
- cache
261
- Use a cache of unique, converted dates to apply the datetime conversion.
262
- ambiguous
263
- Determine how to deal with ambiguous datetimes:
264
-
265
- - `'raise'` (default): raise
266
- - `'earliest'`: use the earliest datetime
267
- - `'latest'`: use the latest datetime
268
- - `'null'`: set to null
269
-
270
- Notes
271
- -----
272
- When converting to a Datetime type, the time unit is inferred from the format
273
- string if given, eg: `"%F %T%.3f"` => `Datetime("ms")`. If no fractional
274
- second component is found, the default is `"us"`.
275
-
276
- Examples
277
- --------
278
- Dealing with a consistent format:
279
-
280
- >>> s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
281
- >>> s.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z")
282
- shape: (2,)
283
- Series: '' [datetime[μs, UTC]]
284
- [
285
- 2020-01-01 01:00:00 UTC
286
- 2020-01-01 02:00:00 UTC
287
- ]
288
-
289
- Dealing with different formats.
290
-
291
- >>> s = pl.Series(
292
- ... "date",
293
- ... [
294
- ... "2021-04-22",
295
- ... "2022-01-04 00:00:00",
296
- ... "01/31/22",
297
- ... "Sun Jul 8 00:34:60 2001",
298
- ... ],
299
- ... )
300
- >>> s.to_frame().select(
301
- ... pl.coalesce(
302
- ... pl.col("date").str.strptime(pl.Date, "%F", strict=False),
303
- ... pl.col("date").str.strptime(pl.Date, "%F %T", strict=False),
304
- ... pl.col("date").str.strptime(pl.Date, "%D", strict=False),
305
- ... pl.col("date").str.strptime(pl.Date, "%c", strict=False),
306
- ... )
307
- ... ).to_series()
308
- shape: (4,)
309
- Series: 'date' [date]
310
- [
311
- 2021-04-22
312
- 2022-01-04
313
- 2022-01-31
314
- 2001-07-08
315
- ]
316
- """
317
- if format is None and (
318
- dtype is Datetime
319
- or (isinstance(dtype, Datetime) and dtype.time_zone is None)
320
- ):
321
- time_unit = None
322
- if isinstance(dtype, Datetime):
323
- time_unit = dtype.time_unit
324
-
325
- return self.to_datetime(
326
- time_unit=time_unit,
327
- strict=strict,
328
- exact=exact,
329
- cache=cache,
330
- ambiguous=ambiguous,
331
- )
332
- else:
333
- ambiguous_expr = F.lit(ambiguous)
334
- s = wrap_s(self._s)
335
- return (
336
- s.to_frame()
337
- .select_seq(
338
- F.col(s.name).str.strptime(
339
- dtype,
340
- format,
341
- strict=strict,
342
- exact=exact,
343
- cache=cache,
344
- ambiguous=ambiguous_expr,
345
- )
346
- )
347
- .to_series()
348
- )
349
-
350
- @deprecate_nonkeyword_arguments(allowed_args=["self"], version="1.20.0")
351
- def to_decimal(
352
- self,
353
- inference_length: int = 100,
354
- *,
355
- scale: int | None = None,
356
- ) -> Series:
357
- """
358
- Convert a String column into a Decimal column.
359
-
360
- This method infers the needed parameters `precision` and `scale` if not
361
- given.
362
-
363
- .. versionchanged:: 1.20.0
364
- Parameter `inference_length` should now be passed as a keyword argument.
365
-
366
- Parameters
367
- ----------
368
- inference_length
369
- Number of elements to parse to determine the `precision` and `scale`
370
- scale
371
- Number of digits after the comma to use for the decimals.
372
-
373
- Examples
374
- --------
375
- >>> s = pl.Series(
376
- ... ["40.12", "3420.13", "120134.19", "3212.98", "12.90", "143.09", "143.9"]
377
- ... )
378
- >>> s.str.to_decimal()
379
- shape: (7,)
380
- Series: '' [decimal[8,2]]
381
- [
382
- 40.12
383
- 3420.13
384
- 120134.19
385
- 3212.98
386
- 12.90
387
- 143.09
388
- 143.90
389
- ]
390
- """
391
- if scale is not None:
392
- s = wrap_s(self._s)
393
- return (
394
- s.to_frame()
395
- .select_seq(F.col(s.name).str.to_decimal(scale=scale))
396
- .to_series()
397
- )
398
- else:
399
- return wrap_s(
400
- self._s.str_to_decimal_infer(inference_length=inference_length)
401
- )
402
-
403
- def len_bytes(self) -> Series:
404
- """
405
- Return the length of each string as the number of bytes.
406
-
407
- Returns
408
- -------
409
- Series
410
- Series of data type :class:`UInt32`.
411
-
412
- See Also
413
- --------
414
- len_chars
415
-
416
- Notes
417
- -----
418
- When working with non-ASCII text, the length in bytes is not the same as the
419
- length in characters. You may want to use :func:`len_chars` instead.
420
- Note that :func:`len_bytes` is much more performant (_O(1)_) than
421
- :func:`len_chars` (_O(n)_).
422
-
423
- Examples
424
- --------
425
- >>> s = pl.Series(["Café", "345", "東京", None])
426
- >>> s.str.len_bytes()
427
- shape: (4,)
428
- Series: '' [u32]
429
- [
430
- 5
431
- 3
432
- 6
433
- null
434
- ]
435
- """
436
-
437
- def len_chars(self) -> Series:
438
- """
439
- Return the length of each string as the number of characters.
440
-
441
- Returns
442
- -------
443
- Series
444
- Series of data type :class:`UInt32`.
445
-
446
- See Also
447
- --------
448
- len_bytes
449
-
450
- Notes
451
- -----
452
- When working with ASCII text, use :func:`len_bytes` instead to achieve
453
- equivalent output with much better performance:
454
- :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in (_O(n)_).
455
-
456
- A character is defined as a `Unicode scalar value`_. A single character is
457
- represented by a single byte when working with ASCII text, and a maximum of
458
- 4 bytes otherwise.
459
-
460
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
461
-
462
- Examples
463
- --------
464
- >>> s = pl.Series(["Café", "345", "東京", None])
465
- >>> s.str.len_chars()
466
- shape: (4,)
467
- Series: '' [u32]
468
- [
469
- 4
470
- 3
471
- 2
472
- null
473
- ]
474
- """
475
-
476
- def contains(
477
- self, pattern: str | Expr, *, literal: bool = False, strict: bool = True
478
- ) -> Series:
479
- """
480
- Check if the string contains a substring that matches a pattern.
481
-
482
- Parameters
483
- ----------
484
- pattern
485
- A valid regular expression pattern, compatible with the `regex crate
486
- <https://docs.rs/regex/latest/regex/>`_.
487
- literal
488
- Treat `pattern` as a literal string, not as a regular expression.
489
- strict
490
- Raise an error if the underlying pattern is not a valid regex,
491
- otherwise mask out with a null value.
492
-
493
- Notes
494
- -----
495
- To modify regular expression behaviour (such as case-sensitivity) with
496
- flags, use the inline `(?iLmsuxU)` syntax. For example:
497
-
498
- Default (case-sensitive) match:
499
-
500
- >>> s = pl.Series("s", ["AAA", "aAa", "aaa"])
501
- >>> s.str.contains("AA").to_list()
502
- [True, False, False]
503
-
504
- Case-insensitive match, using an inline flag:
505
-
506
- >>> s = pl.Series("s", ["AAA", "aAa", "aaa"])
507
- >>> s.str.contains("(?i)AA").to_list()
508
- [True, True, True]
509
-
510
- See the regex crate's section on `grouping and flags
511
- <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
512
- additional information about the use of inline expression modifiers.
513
-
514
- Returns
515
- -------
516
- Series
517
- Series of data type :class:`Boolean`.
518
-
519
- Examples
520
- --------
521
- >>> s = pl.Series(["Crab", "cat and dog", "rab$bit", None])
522
- >>> s.str.contains("cat|bit")
523
- shape: (4,)
524
- Series: '' [bool]
525
- [
526
- false
527
- true
528
- true
529
- null
530
- ]
531
- >>> s.str.contains("rab$", literal=True)
532
- shape: (4,)
533
- Series: '' [bool]
534
- [
535
- false
536
- false
537
- true
538
- null
539
- ]
540
- """
541
-
542
- def find(
543
- self, pattern: str | Expr, *, literal: bool = False, strict: bool = True
544
- ) -> Series:
545
- """
546
- Return the bytes offset of the first substring matching a pattern.
547
-
548
- If the pattern is not found, returns None.
549
-
550
- Parameters
551
- ----------
552
- pattern
553
- A valid regular expression pattern, compatible with the `regex crate
554
- <https://docs.rs/regex/latest/regex/>`_.
555
- literal
556
- Treat `pattern` as a literal string, not as a regular expression.
557
- strict
558
- Raise an error if the underlying pattern is not a valid regex,
559
- otherwise mask out with a null value.
560
-
561
- Notes
562
- -----
563
- To modify regular expression behaviour (such as case-sensitivity) with
564
- flags, use the inline `(?iLmsuxU)` syntax. For example:
565
-
566
- >>> s = pl.Series("s", ["AAA", "aAa", "aaa"])
567
-
568
- Default (case-sensitive) match:
569
-
570
- >>> s.str.find("Aa").to_list()
571
- [None, 1, None]
572
-
573
- Case-insensitive match, using an inline flag:
574
-
575
- >>> s.str.find("(?i)Aa").to_list()
576
- [0, 0, 0]
577
-
578
- See the regex crate's section on `grouping and flags
579
- <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
580
- additional information about the use of inline expression modifiers.
581
-
582
- See Also
583
- --------
584
- contains : Check if the string contains a substring that matches a pattern.
585
-
586
- Examples
587
- --------
588
- >>> s = pl.Series("txt", ["Crab", "Lobster", None, "Crustacean"])
589
-
590
- Find the index of the first substring matching a regex pattern:
591
-
592
- >>> s.str.find("a|e").rename("idx_rx")
593
- shape: (4,)
594
- Series: 'idx_rx' [u32]
595
- [
596
- 2
597
- 5
598
- null
599
- 5
600
- ]
601
-
602
- Find the index of the first substring matching a literal pattern:
603
-
604
- >>> s.str.find("e", literal=True).rename("idx_lit")
605
- shape: (4,)
606
- Series: 'idx_lit' [u32]
607
- [
608
- null
609
- 5
610
- null
611
- 7
612
- ]
613
-
614
- Match against a pattern found in another column or (expression):
615
-
616
- >>> p = pl.Series("pat", ["a[bc]", "b.t", "[aeiuo]", "(?i)A[BC]"])
617
- >>> s.str.find(p).rename("idx")
618
- shape: (4,)
619
- Series: 'idx' [u32]
620
- [
621
- 2
622
- 2
623
- null
624
- 5
625
- ]
626
- """
627
-
628
- def ends_with(self, suffix: str | Expr | None) -> Series:
629
- """
630
- Check if string values end with a substring.
631
-
632
- Parameters
633
- ----------
634
- suffix
635
- Suffix substring.
636
-
637
- See Also
638
- --------
639
- contains : Check if the string contains a substring that matches a pattern.
640
- starts_with : Check if string values start with a substring.
641
-
642
- Examples
643
- --------
644
- >>> s = pl.Series("fruits", ["apple", "mango", None])
645
- >>> s.str.ends_with("go")
646
- shape: (3,)
647
- Series: 'fruits' [bool]
648
- [
649
- false
650
- true
651
- null
652
- ]
653
- """
654
-
655
- def starts_with(self, prefix: str | Expr) -> Series:
656
- """
657
- Check if string values start with a substring.
658
-
659
- Parameters
660
- ----------
661
- prefix
662
- Prefix substring.
663
-
664
- See Also
665
- --------
666
- contains : Check if the string contains a substring that matches a pattern.
667
- ends_with : Check if string values end with a substring.
668
-
669
- Examples
670
- --------
671
- >>> s = pl.Series("fruits", ["apple", "mango", None])
672
- >>> s.str.starts_with("app")
673
- shape: (3,)
674
- Series: 'fruits' [bool]
675
- [
676
- true
677
- false
678
- null
679
- ]
680
- """
681
-
682
- def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Series:
683
- r"""
684
- Decode values using the provided encoding.
685
-
686
- Parameters
687
- ----------
688
- encoding : {'hex', 'base64'}
689
- The encoding to use.
690
- strict
691
- Raise an error if the underlying value cannot be decoded,
692
- otherwise mask out with a null value.
693
-
694
- Returns
695
- -------
696
- Series
697
- Series of data type :class:`Binary`.
698
-
699
- Examples
700
- --------
701
- >>> s = pl.Series("color", ["000000", "ffff00", "0000ff"])
702
- >>> s.str.decode("hex")
703
- shape: (3,)
704
- Series: 'color' [binary]
705
- [
706
- b"\x00\x00\x00"
707
- b"\xff\xff\x00"
708
- b"\x00\x00\xff"
709
- ]
710
- """
711
-
712
- def encode(self, encoding: TransferEncoding) -> Series:
713
- """
714
- Encode a value using the provided encoding.
715
-
716
- Parameters
717
- ----------
718
- encoding : {'hex', 'base64'}
719
- The encoding to use.
720
-
721
- Returns
722
- -------
723
- Series
724
- Series of data type :class:`String`.
725
-
726
- Examples
727
- --------
728
- >>> s = pl.Series(["foo", "bar", None])
729
- >>> s.str.encode("hex")
730
- shape: (3,)
731
- Series: '' [str]
732
- [
733
- "666f6f"
734
- "626172"
735
- null
736
- ]
737
- """
738
-
739
- def json_decode(
740
- self,
741
- dtype: PolarsDataType | None = None,
742
- *,
743
- infer_schema_length: int | None = N_INFER_DEFAULT,
744
- ) -> Series:
745
- """
746
- Parse string values as JSON.
747
-
748
- Throws an error if invalid JSON strings are encountered.
749
-
750
- Parameters
751
- ----------
752
- dtype
753
- The dtype to cast the extracted value to. If None, the dtype will be
754
- inferred from the JSON value.
755
- infer_schema_length
756
- The maximum number of rows to scan for schema inference.
757
- If set to `None`, the full data may be scanned *(this is slow)*.
758
-
759
- See Also
760
- --------
761
- json_path_match : Extract the first match of json string with provided JSONPath
762
- expression.
763
-
764
- Examples
765
- --------
766
- >>> s = pl.Series("json", ['{"a":1, "b": true}', None, '{"a":2, "b": false}'])
767
- >>> s.str.json_decode()
768
- shape: (3,)
769
- Series: 'json' [struct[2]]
770
- [
771
- {1,true}
772
- null
773
- {2,false}
774
- ]
775
- """
776
- if dtype is not None:
777
- s = wrap_s(self._s)
778
- return (
779
- s.to_frame()
780
- .select_seq(F.col(s.name).str.json_decode(dtype))
781
- .to_series()
782
- )
783
-
784
- return wrap_s(self._s.str_json_decode(infer_schema_length))
785
-
786
- def json_path_match(self, json_path: IntoExprColumn) -> Series:
787
- """
788
- Extract the first match of JSON string with provided JSONPath expression.
789
-
790
- Throw errors if encounter invalid JSON strings.
791
- All return values will be cast to String regardless of the original value.
792
-
793
- Documentation on JSONPath standard can be found
794
- `here <https://goessner.net/articles/JsonPath/>`_.
795
-
796
- Parameters
797
- ----------
798
- json_path
799
- A valid JSON path query string.
800
-
801
- Returns
802
- -------
803
- Series
804
- Series of data type :class:`String`. Contains null values if the original
805
- value is null or the json_path returns nothing.
806
-
807
- Examples
808
- --------
809
- >>> df = pl.DataFrame(
810
- ... {"json_val": ['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']}
811
- ... )
812
- >>> df.select(pl.col("json_val").str.json_path_match("$.a"))[:, 0]
813
- shape: (5,)
814
- Series: 'json_val' [str]
815
- [
816
- "1"
817
- null
818
- "2"
819
- "2.1"
820
- "true"
821
- ]
822
- """
823
-
824
- def extract(self, pattern: IntoExprColumn, group_index: int = 1) -> Series:
825
- r"""
826
- Extract the target capture group from provided patterns.
827
-
828
- Parameters
829
- ----------
830
- pattern
831
- A valid regular expression pattern containing at least one capture group,
832
- compatible with the `regex crate <https://docs.rs/regex/latest/regex/>`_.
833
- group_index
834
- Index of the targeted capture group.
835
- Group 0 means the whole pattern, the first group begins at index 1.
836
- Defaults to the first capture group.
837
-
838
- Returns
839
- -------
840
- Series
841
- Series of data type :class:`String`. Contains null values if the original
842
- value is null or regex captures nothing.
843
-
844
- Notes
845
- -----
846
- To modify regular expression behaviour (such as multi-line matching)
847
- with flags, use the inline `(?iLmsuxU)` syntax. For example:
848
-
849
- >>> s = pl.Series(
850
- ... name="lines",
851
- ... values=[
852
- ... "I Like\nThose\nOdds",
853
- ... "This is\nThe Way",
854
- ... ],
855
- ... )
856
- >>> s.str.extract(r"(?m)^(T\w+)", 1).alias("matches")
857
- shape: (2,)
858
- Series: 'matches' [str]
859
- [
860
- "Those"
861
- "This"
862
- ]
863
-
864
- See the regex crate's section on `grouping and flags
865
- <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
866
- additional information about the use of inline expression modifiers.
867
-
868
- Examples
869
- --------
870
- >>> s = pl.Series(
871
- ... name="url",
872
- ... values=[
873
- ... "http://vote.com/ballon_dor?ref=polars&candidate=messi",
874
- ... "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",
875
- ... "http://vote.com/ballon_dor?error=404&ref=unknown",
876
- ... ],
877
- ... )
878
- >>> s.str.extract(r"candidate=(\w+)", 1).alias("candidate")
879
- shape: (3,)
880
- Series: 'candidate' [str]
881
- [
882
- "messi"
883
- "ronaldo"
884
- null
885
- ]
886
- """
887
-
888
- def extract_all(self, pattern: str | Series) -> Series:
889
- r'''
890
- Extract all matches for the given regex pattern.
891
-
892
- Extract each successive non-overlapping regex match in an individual string
893
- as a list. If the haystack string is `null`, `null` is returned.
894
-
895
- Parameters
896
- ----------
897
- pattern
898
- A valid regular expression pattern, compatible with the `regex crate
899
- <https://docs.rs/regex/latest/regex/>`_.
900
-
901
- Notes
902
- -----
903
- To modify regular expression behaviour (such as "verbose" mode and/or
904
- case-sensitive matching) with flags, use the inline `(?iLmsuxU)` syntax.
905
- For example:
906
-
907
- >>> s = pl.Series(
908
- ... name="email",
909
- ... values=[
910
- ... "real.email@spam.com",
911
- ... "some_account@somewhere.net",
912
- ... "abc.def.ghi.jkl@uvw.xyz.co.uk",
913
- ... ],
914
- ... )
915
- >>> # extract name/domain parts from email, using verbose regex
916
- >>> s.str.extract_all(
917
- ... r"""(?xi) # activate 'verbose' and 'case-insensitive' flags
918
- ... [ # (start character group)
919
- ... A-Z # letters
920
- ... 0-9 # digits
921
- ... ._%+\- # special chars
922
- ... ] # (end character group)
923
- ... + # 'one or more' quantifier
924
- ... """
925
- ... ).alias("email_parts")
926
- shape: (3,)
927
- Series: 'email_parts' [list[str]]
928
- [
929
- ["real.email", "spam.com"]
930
- ["some_account", "somewhere.net"]
931
- ["abc.def.ghi.jkl", "uvw.xyz.co.uk"]
932
- ]
933
-
934
- See the regex crate's section on `grouping and flags
935
- <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_ for
936
- additional information about the use of inline expression modifiers.
937
-
938
- Returns
939
- -------
940
- Series
941
- Series of data type `List(String)`.
942
-
943
- Examples
944
- --------
945
- >>> s = pl.Series("foo", ["123 bla 45 asd", "xyz 678 910t", "bar", None])
946
- >>> s.str.extract_all(r"\d+")
947
- shape: (4,)
948
- Series: 'foo' [list[str]]
949
- [
950
- ["123", "45"]
951
- ["678", "910"]
952
- []
953
- null
954
- ]
955
-
956
- '''
957
-
958
- def extract_groups(self, pattern: str) -> Series:
959
- r"""
960
- Extract all capture groups for the given regex pattern.
961
-
962
- Parameters
963
- ----------
964
- pattern
965
- A valid regular expression pattern containing at least one capture group,
966
- compatible with the `regex crate <https://docs.rs/regex/latest/regex/>`_.
967
-
968
- Notes
969
- -----
970
- All group names are **strings**.
971
-
972
- If your pattern contains unnamed groups, their numerical position is converted
973
- to a string.
974
-
975
- For example, we can access the first group via the string `"1"`::
976
-
977
- >>> (
978
- ... pl.Series(["foo bar baz"])
979
- ... .str.extract_groups(r"(\w+) (.+) (\w+)")
980
- ... .struct["1"]
981
- ... )
982
- shape: (1,)
983
- Series: '1' [str]
984
- [
985
- "foo"
986
- ]
987
-
988
- Returns
989
- -------
990
- Series
991
- Series of data type :class:`Struct` with fields of data type
992
- :class:`String`.
993
-
994
- Examples
995
- --------
996
- >>> s = pl.Series(
997
- ... name="url",
998
- ... values=[
999
- ... "http://vote.com/ballon_dor?candidate=messi&ref=python",
1000
- ... "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
1001
- ... "http://vote.com/ballon_dor?error=404&ref=rust",
1002
- ... ],
1003
- ... )
1004
- >>> s.str.extract_groups(r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)")
1005
- shape: (3,)
1006
- Series: 'url' [struct[2]]
1007
- [
1008
- {"messi","python"}
1009
- {"weghorst","polars"}
1010
- {null,null}
1011
- ]
1012
- """
1013
-
1014
- def count_matches(self, pattern: str | Series, *, literal: bool = False) -> Series:
1015
- r"""
1016
- Count all successive non-overlapping regex matches.
1017
-
1018
- Parameters
1019
- ----------
1020
- pattern
1021
- A valid regular expression pattern, compatible with the `regex crate
1022
- <https://docs.rs/regex/latest/regex/>`_. Can also be a :class:`Series` of
1023
- regular expressions.
1024
- literal
1025
- Treat `pattern` as a literal string, not as a regular expression.
1026
-
1027
- Returns
1028
- -------
1029
- Series
1030
- Series of data type :class:`UInt32`. Returns null if the original
1031
- value is null.
1032
-
1033
- Examples
1034
- --------
1035
- >>> s = pl.Series("foo", ["123 bla 45 asd", "xyz 678 910t", "bar", None])
1036
- >>> # count digits
1037
- >>> s.str.count_matches(r"\d")
1038
- shape: (4,)
1039
- Series: 'foo' [u32]
1040
- [
1041
- 5
1042
- 6
1043
- 0
1044
- null
1045
- ]
1046
-
1047
- >>> s = pl.Series("bar", ["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None])
1048
- >>> s.str.count_matches(r"\d", literal=True)
1049
- shape: (4,)
1050
- Series: 'bar' [u32]
1051
- [
1052
- 0
1053
- 0
1054
- 2
1055
- null
1056
- ]
1057
- """
1058
-
1059
- def split(self, by: IntoExpr, *, inclusive: bool = False) -> Series:
1060
- """
1061
- Split the string by a substring.
1062
-
1063
- Parameters
1064
- ----------
1065
- by
1066
- Substring to split by.
1067
- inclusive
1068
- If True, include the split character/string in the results.
1069
-
1070
- Returns
1071
- -------
1072
- Series
1073
- Series of data type `List(String)`.
1074
- """
1075
-
1076
- def split_exact(self, by: IntoExpr, n: int, *, inclusive: bool = False) -> Series:
1077
- """
1078
- Split the string by a substring using `n` splits.
1079
-
1080
- Results in a struct of `n+1` fields.
1081
-
1082
- If it cannot make `n` splits, the remaining field elements will be null.
1083
-
1084
- Parameters
1085
- ----------
1086
- by
1087
- Substring to split by.
1088
- n
1089
- Number of splits to make.
1090
- inclusive
1091
- If True, include the split character/string in the results.
1092
-
1093
- Examples
1094
- --------
1095
- >>> df = pl.DataFrame({"x": ["a_1", None, "c", "d_4"]})
1096
- >>> df["x"].str.split_exact("_", 1).alias("fields")
1097
- shape: (4,)
1098
- Series: 'fields' [struct[2]]
1099
- [
1100
- {"a","1"}
1101
- {null,null}
1102
- {"c",null}
1103
- {"d","4"}
1104
- ]
1105
-
1106
- Split string values in column x in exactly 2 parts and assign
1107
- each part to a new column.
1108
-
1109
- >>> (
1110
- ... df["x"]
1111
- ... .str.split_exact("_", 1)
1112
- ... .struct.rename_fields(["first_part", "second_part"])
1113
- ... .alias("fields")
1114
- ... .to_frame()
1115
- ... .unnest("fields")
1116
- ... )
1117
- shape: (4, 2)
1118
- ┌────────────┬─────────────┐
1119
- │ first_part ┆ second_part │
1120
- │ --- ┆ --- │
1121
- │ str ┆ str │
1122
- ╞════════════╪═════════════╡
1123
- │ a ┆ 1 │
1124
- │ null ┆ null │
1125
- │ c ┆ null │
1126
- │ d ┆ 4 │
1127
- └────────────┴─────────────┘
1128
-
1129
- Returns
1130
- -------
1131
- Series
1132
- Series of data type :class:`Struct` with fields of data type
1133
- :class:`String`.
1134
- """
1135
-
1136
- def splitn(self, by: IntoExpr, n: int) -> Series:
1137
- """
1138
- Split the string by a substring, restricted to returning at most `n` items.
1139
-
1140
- If the number of possible splits is less than `n-1`, the remaining field
1141
- elements will be null. If the number of possible splits is `n-1` or greater,
1142
- the last (nth) substring will contain the remainder of the string.
1143
-
1144
- Parameters
1145
- ----------
1146
- by
1147
- Substring to split by.
1148
- n
1149
- Max number of items to return.
1150
-
1151
- Examples
1152
- --------
1153
- >>> df = pl.DataFrame({"s": ["foo bar", None, "foo-bar", "foo bar baz"]})
1154
- >>> df["s"].str.splitn(" ", 2).alias("fields")
1155
- shape: (4,)
1156
- Series: 'fields' [struct[2]]
1157
- [
1158
- {"foo","bar"}
1159
- {null,null}
1160
- {"foo-bar",null}
1161
- {"foo","bar baz"}
1162
- ]
1163
-
1164
- Split string values in column s in exactly 2 parts and assign
1165
- each part to a new column.
1166
-
1167
- >>> (
1168
- ... df["s"]
1169
- ... .str.splitn(" ", 2)
1170
- ... .struct.rename_fields(["first_part", "second_part"])
1171
- ... .alias("fields")
1172
- ... .to_frame()
1173
- ... .unnest("fields")
1174
- ... )
1175
- shape: (4, 2)
1176
- ┌────────────┬─────────────┐
1177
- │ first_part ┆ second_part │
1178
- │ --- ┆ --- │
1179
- │ str ┆ str │
1180
- ╞════════════╪═════════════╡
1181
- │ foo ┆ bar │
1182
- │ null ┆ null │
1183
- │ foo-bar ┆ null │
1184
- │ foo ┆ bar baz │
1185
- └────────────┴─────────────┘
1186
-
1187
- Returns
1188
- -------
1189
- Series
1190
- Series of data type :class:`Struct` with fields of data type
1191
- :class:`String`.
1192
- """
1193
-
1194
- def replace(
1195
- self, pattern: str, value: str, *, literal: bool = False, n: int = 1
1196
- ) -> Series:
1197
- r"""
1198
- Replace first matching regex/literal substring with a new string value.
1199
-
1200
- Parameters
1201
- ----------
1202
- pattern
1203
- A valid regular expression pattern, compatible with the `regex crate
1204
- <https://docs.rs/regex/latest/regex/>`_.
1205
- value
1206
- String that will replace the matched substring.
1207
- literal
1208
- Treat `pattern` as a literal string.
1209
- n
1210
- Number of matches to replace.
1211
-
1212
- See Also
1213
- --------
1214
- replace_all
1215
-
1216
- Notes
1217
- -----
1218
- * To modify regular expression behaviour (such as case-sensitivity) with flags,
1219
- use the inline `(?iLmsuxU)` syntax. (See the regex crate's section on
1220
- `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_
1221
- for additional information about the use of inline expression modifiers).
1222
-
1223
- * The dollar sign (`$`) is a special character related to capture groups; if you
1224
- want to replace some target pattern with characters that include a literal `$`
1225
- you should escape it by doubling it up as `$$`, or set `literal=True` if you
1226
- do not need a full regular expression pattern match. Otherwise, you will be
1227
- referencing a (potentially non-existent) capture group.
1228
-
1229
- If not escaped, the `$0` in the replacement value (below) represents a capture
1230
- group:
1231
-
1232
- .. code-block:: python
1233
-
1234
- >>> s = pl.Series("cents", ["000.25", "00.50", "0.75"])
1235
- >>> s.str.replace(r"^(0+)\.", "$0.")
1236
- shape: (3,)
1237
- Series: 'cents' [str]
1238
- [
1239
- "000..25"
1240
- "00..50"
1241
- "0..75"
1242
- ]
1243
-
1244
- To have `$` represent a literal value, it should be doubled-up as `$$`
1245
- (or, for simpler find/replace operations, set `literal=True` if you do
1246
- not require a full regular expression match):
1247
-
1248
- .. code-block:: python
1249
-
1250
- >>> s.str.replace(r"^(0+)\.", "$$0.")
1251
- shape: (3,)
1252
- Series: 'cents' [str]
1253
- [
1254
- "$0.25"
1255
- "$0.50"
1256
- "$0.75"
1257
- ]
1258
-
1259
- Examples
1260
- --------
1261
- >>> s = pl.Series(["123abc", "abc456"])
1262
- >>> s.str.replace(r"abc\b", "ABC")
1263
- shape: (2,)
1264
- Series: '' [str]
1265
- [
1266
- "123ABC"
1267
- "abc456"
1268
- ]
1269
-
1270
- Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer
1271
- to the first capture group in the `pattern`, `$2` or `${2}` to refer to the
1272
- second capture group, and so on. You can also use *named* capture groups.
1273
-
1274
- >>> s = pl.Series(["hat", "hut"])
1275
- >>> s.str.replace("h(.)t", "b${1}d")
1276
- shape: (2,)
1277
- Series: '' [str]
1278
- [
1279
- "bad"
1280
- "bud"
1281
- ]
1282
- >>> s.str.replace("h(?<vowel>.)t", "b${vowel}d")
1283
- shape: (2,)
1284
- Series: '' [str]
1285
- [
1286
- "bad"
1287
- "bud"
1288
- ]
1289
-
1290
- Apply case-insensitive string replacement using the `(?i)` flag.
1291
-
1292
- >>> s = pl.Series("weather", ["Foggy", "Rainy", "Sunny"])
1293
- >>> s.str.replace(r"(?i)foggy|rainy", "Sunny")
1294
- shape: (3,)
1295
- Series: 'weather' [str]
1296
- [
1297
- "Sunny"
1298
- "Sunny"
1299
- "Sunny"
1300
- ]
1301
- """
1302
-
1303
- def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> Series:
1304
- r"""
1305
- Replace all matching regex/literal substrings with a new string value.
1306
-
1307
- Parameters
1308
- ----------
1309
- pattern
1310
- A valid regular expression pattern, compatible with the `regex crate
1311
- <https://docs.rs/regex/latest/regex/>`_.
1312
- value
1313
- String that will replace the matched substring.
1314
- literal
1315
- Treat `pattern` as a literal string.
1316
-
1317
- See Also
1318
- --------
1319
- replace
1320
-
1321
- Notes
1322
- -----
1323
- * To modify regular expression behaviour (such as case-sensitivity) with flags,
1324
- use the inline `(?iLmsuxU)` syntax. (See the regex crate's section on
1325
- `grouping and flags <https://docs.rs/regex/latest/regex/#grouping-and-flags>`_
1326
- for additional information about the use of inline expression modifiers).
1327
-
1328
- * The dollar sign (`$`) is a special character related to capture groups; if you
1329
- want to replace some target pattern with characters that include a literal `$`
1330
- you should escape it by doubling it up as `$$`, or set `literal=True` if you
1331
- do not need a full regular expression pattern match. Otherwise, you will be
1332
- referencing a (potentially non-existent) capture group.
1333
-
1334
- In the example below we need to double up `$` (to represent a literal dollar
1335
- sign, and then refer to the capture group using `$n` or `${n}`, hence the
1336
- three consecutive `$` characters in the replacement value:
1337
-
1338
- .. code-block:: python
1339
-
1340
- >>> s = pl.Series("cost", ["#12.34", "#56.78"])
1341
- >>> s.str.replace_all(r"#(\d+)", "$$${1}").alias("cost_usd")
1342
- shape: (2,)
1343
- Series: 'cost_usd' [str]
1344
- [
1345
- "$12.34"
1346
- "$56.78"
1347
- ]
1348
-
1349
- Examples
1350
- --------
1351
- >>> s = pl.Series(["123abc", "abc456"])
1352
- >>> s.str.replace_all(r"abc\b", "ABC")
1353
- shape: (2,)
1354
- Series: '' [str]
1355
- [
1356
- "123ABC"
1357
- "abc456"
1358
- ]
1359
-
1360
- Capture groups are supported. Use `$1` or `${1}` in the `value` string to refer
1361
- to the first capture group in the `pattern`, `$2` or `${2}` to refer to the
1362
- second capture group, and so on. You can also use *named* capture groups.
1363
-
1364
- >>> s = pl.Series(["hat", "hut"])
1365
- >>> s.str.replace_all("h(.)t", "b${1}d")
1366
- shape: (2,)
1367
- Series: '' [str]
1368
- [
1369
- "bad"
1370
- "bud"
1371
- ]
1372
- >>> s.str.replace_all("h(?<vowel>.)t", "b${vowel}d")
1373
- shape: (2,)
1374
- Series: '' [str]
1375
- [
1376
- "bad"
1377
- "bud"
1378
- ]
1379
-
1380
- Apply case-insensitive string replacement using the `(?i)` flag.
1381
-
1382
- >>> s = pl.Series("weather", ["Foggy", "Rainy", "Sunny"])
1383
- >>> s.str.replace_all(r"(?i)foggy|rainy", "Sunny")
1384
- shape: (3,)
1385
- Series: 'weather' [str]
1386
- [
1387
- "Sunny"
1388
- "Sunny"
1389
- "Sunny"
1390
- ]
1391
- """
1392
-
1393
- def strip_chars(self, characters: IntoExpr = None) -> Series:
1394
- r"""
1395
- Remove leading and trailing characters.
1396
-
1397
- Parameters
1398
- ----------
1399
- characters
1400
- The set of characters to be removed. All combinations of this set of
1401
- characters will be stripped from the start and end of the string. If set to
1402
- None (default), all leading and trailing whitespace is removed instead.
1403
-
1404
- Examples
1405
- --------
1406
- >>> s = pl.Series([" hello ", "\tworld"])
1407
- >>> s.str.strip_chars()
1408
- shape: (2,)
1409
- Series: '' [str]
1410
- [
1411
- "hello"
1412
- "world"
1413
- ]
1414
-
1415
- Characters can be stripped by passing a string as argument. Note that whitespace
1416
- will not be stripped automatically when doing so, unless that whitespace is
1417
- also included in the string.
1418
-
1419
- >>> s.str.strip_chars("o ")
1420
- shape: (2,)
1421
- Series: '' [str]
1422
- [
1423
- "hell"
1424
- " world"
1425
- ]
1426
- """
1427
-
1428
- def strip_chars_start(self, characters: IntoExpr = None) -> Series:
1429
- r"""
1430
- Remove leading characters.
1431
-
1432
- Parameters
1433
- ----------
1434
- characters
1435
- The set of characters to be removed. All combinations of this set of
1436
- characters will be stripped from the start of the string. If set to None
1437
- (default), all leading whitespace is removed instead.
1438
-
1439
- Examples
1440
- --------
1441
- >>> s = pl.Series([" hello ", "\tworld"])
1442
- >>> s.str.strip_chars_start()
1443
- shape: (2,)
1444
- Series: '' [str]
1445
- [
1446
- "hello "
1447
- "world"
1448
- ]
1449
-
1450
- Characters can be stripped by passing a string as argument. Note that whitespace
1451
- will not be stripped automatically when doing so.
1452
-
1453
- >>> s.str.strip_chars_start("wod\t")
1454
- shape: (2,)
1455
- Series: '' [str]
1456
- [
1457
- " hello "
1458
- "rld"
1459
- ]
1460
- """
1461
-
1462
- def strip_chars_end(self, characters: IntoExpr = None) -> Series:
1463
- r"""
1464
- Remove trailing characters.
1465
-
1466
- Parameters
1467
- ----------
1468
- characters
1469
- The set of characters to be removed. All combinations of this set of
1470
- characters will be stripped from the end of the string. If set to None
1471
- (default), all trailing whitespace is removed instead.
1472
-
1473
- Examples
1474
- --------
1475
- >>> s = pl.Series([" hello ", "world\t"])
1476
- >>> s.str.strip_chars_end()
1477
- shape: (2,)
1478
- Series: '' [str]
1479
- [
1480
- " hello"
1481
- "world"
1482
- ]
1483
-
1484
- Characters can be stripped by passing a string as argument. Note that whitespace
1485
- will not be stripped automatically when doing so.
1486
-
1487
- >>> s.str.strip_chars_end("orld\t")
1488
- shape: (2,)
1489
- Series: '' [str]
1490
- [
1491
- " hello "
1492
- "w"
1493
- ]
1494
- """
1495
-
1496
- def strip_prefix(self, prefix: IntoExpr) -> Series:
1497
- """
1498
- Remove prefix.
1499
-
1500
- The prefix will be removed from the string exactly once, if found.
1501
-
1502
- Parameters
1503
- ----------
1504
- prefix
1505
- The prefix to be removed.
1506
-
1507
- Examples
1508
- --------
1509
- >>> s = pl.Series(["foobar", "foofoobar", "foo", "bar"])
1510
- >>> s.str.strip_prefix("foo")
1511
- shape: (4,)
1512
- Series: '' [str]
1513
- [
1514
- "bar"
1515
- "foobar"
1516
- ""
1517
- "bar"
1518
- ]
1519
- """
1520
-
1521
- def strip_suffix(self, suffix: IntoExpr) -> Series:
1522
- """
1523
- Remove suffix.
1524
-
1525
- The suffix will be removed from the string exactly once, if found.
1526
-
1527
- Parameters
1528
- ----------
1529
- suffix
1530
- The suffix to be removed.
1531
-
1532
- Examples
1533
- --------
1534
- >>> s = pl.Series(["foobar", "foobarbar", "foo", "bar"])
1535
- >>> s.str.strip_suffix("bar")
1536
- shape: (4,)
1537
- Series: '' [str]
1538
- [
1539
- "foo"
1540
- "foobar"
1541
- "foo"
1542
- ""
1543
- ]
1544
- """
1545
-
1546
- def pad_start(self, length: int | IntoExprColumn, fill_char: str = " ") -> Series:
1547
- """
1548
- Pad the start of the string until it reaches the given length.
1549
-
1550
- Parameters
1551
- ----------
1552
- length
1553
- Pad the string until it reaches this length. Strings with length equal to or
1554
- greater than this value are returned as-is.
1555
- fill_char
1556
- The character to pad the string with.
1557
-
1558
- See Also
1559
- --------
1560
- pad_end
1561
- zfill
1562
-
1563
- Examples
1564
- --------
1565
- >>> s = pl.Series("a", ["cow", "monkey", "hippopotamus", None])
1566
- >>> s.str.pad_start(8, "*")
1567
- shape: (4,)
1568
- Series: 'a' [str]
1569
- [
1570
- "*****cow"
1571
- "**monkey"
1572
- "hippopotamus"
1573
- null
1574
- ]
1575
- """
1576
-
1577
- def pad_end(self, length: int | IntoExprColumn, fill_char: str = " ") -> Series:
1578
- """
1579
- Pad the end of the string until it reaches the given length.
1580
-
1581
- Parameters
1582
- ----------
1583
- length
1584
- Pad the string until it reaches this length. Strings with length equal to or
1585
- greater than this value are returned as-is.
1586
- fill_char
1587
- The character to pad the string with.
1588
-
1589
- See Also
1590
- --------
1591
- pad_start
1592
-
1593
- Examples
1594
- --------
1595
- >>> s = pl.Series(["cow", "monkey", "hippopotamus", None])
1596
- >>> s.str.pad_end(8, "*")
1597
- shape: (4,)
1598
- Series: '' [str]
1599
- [
1600
- "cow*****"
1601
- "monkey**"
1602
- "hippopotamus"
1603
- null
1604
- ]
1605
- """
1606
-
1607
- def zfill(self, length: int | IntoExprColumn) -> Series:
1608
- """
1609
- Pad the start of the string with zeros until it reaches the given length.
1610
-
1611
- A sign prefix (`-`) is handled by inserting the padding after the sign character
1612
- rather than before.
1613
-
1614
- Parameters
1615
- ----------
1616
- length
1617
- Pad the string until it reaches this length. Strings with length equal to or
1618
- greater than this value are returned as-is.
1619
-
1620
- See Also
1621
- --------
1622
- pad_start
1623
-
1624
- Notes
1625
- -----
1626
- This method is intended for padding numeric strings. If your data contains
1627
- non-ASCII characters, use :func:`pad_start` instead.
1628
-
1629
- Examples
1630
- --------
1631
- >>> s = pl.Series([-1, 123, 999999, None])
1632
- >>> s.cast(pl.String).str.zfill(4)
1633
- shape: (4,)
1634
- Series: '' [str]
1635
- [
1636
- "-001"
1637
- "0123"
1638
- "999999"
1639
- null
1640
- ]
1641
- """
1642
-
1643
- def to_lowercase(self) -> Series:
1644
- """
1645
- Modify strings to their lowercase equivalent.
1646
-
1647
- Examples
1648
- --------
1649
- >>> s = pl.Series("foo", ["CAT", "DOG"])
1650
- >>> s.str.to_lowercase()
1651
- shape: (2,)
1652
- Series: 'foo' [str]
1653
- [
1654
- "cat"
1655
- "dog"
1656
- ]
1657
- """
1658
-
1659
- def to_uppercase(self) -> Series:
1660
- """
1661
- Modify strings to their uppercase equivalent.
1662
-
1663
- Examples
1664
- --------
1665
- >>> s = pl.Series("foo", ["cat", "dog"])
1666
- >>> s.str.to_uppercase()
1667
- shape: (2,)
1668
- Series: 'foo' [str]
1669
- [
1670
- "CAT"
1671
- "DOG"
1672
- ]
1673
- """
1674
-
1675
- def to_titlecase(self) -> Series:
1676
- """
1677
- Modify strings to their titlecase equivalent.
1678
-
1679
- Notes
1680
- -----
1681
- This is a form of case transform where the first letter of each word is
1682
- capitalized, with the rest of the word in lowercase. Non-alphanumeric
1683
- characters define the word boundaries.
1684
-
1685
- Examples
1686
- --------
1687
- >>> s = pl.Series(
1688
- ... "quotes",
1689
- ... [
1690
- ... "'e.t. phone home'",
1691
- ... "you talkin' to me?",
1692
- ... "to infinity,and BEYOND!",
1693
- ... ],
1694
- ... )
1695
- >>> s.str.to_titlecase()
1696
- shape: (3,)
1697
- Series: 'quotes' [str]
1698
- [
1699
- "'E.T. Phone Home'"
1700
- "You Talkin' To Me?"
1701
- "To Infinity,And Beyond!"
1702
- ]
1703
- """
1704
-
1705
- def reverse(self) -> Series:
1706
- """
1707
- Returns string values in reversed order.
1708
-
1709
- Examples
1710
- --------
1711
- >>> s = pl.Series("text", ["foo", "bar", "man\u0303ana"])
1712
- >>> s.str.reverse()
1713
- shape: (3,)
1714
- Series: 'text' [str]
1715
- [
1716
- "oof"
1717
- "rab"
1718
- "anañam"
1719
- ]
1720
- """
1721
-
1722
- def slice(
1723
- self, offset: int | IntoExprColumn, length: int | IntoExprColumn | None = None
1724
- ) -> Series:
1725
- """
1726
- Extract a substring from each string value.
1727
-
1728
- Parameters
1729
- ----------
1730
- offset
1731
- Start index. Negative indexing is supported.
1732
- length
1733
- Length of the slice. If set to `None` (default), the slice is taken to the
1734
- end of the string.
1735
-
1736
- Returns
1737
- -------
1738
- Series
1739
- Series of data type :class:`String`.
1740
-
1741
- Notes
1742
- -----
1743
- Both the `offset` and `length` inputs are defined in terms of the number
1744
- of characters in the (UTF8) string. A character is defined as a
1745
- `Unicode scalar value`_. A single character is represented by a single byte
1746
- when working with ASCII text, and a maximum of 4 bytes otherwise.
1747
-
1748
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
1749
-
1750
- Examples
1751
- --------
1752
- >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
1753
- >>> s.str.slice(-3)
1754
- shape: (4,)
1755
- Series: '' [str]
1756
- [
1757
- "ear"
1758
- null
1759
- "aya"
1760
- "uit"
1761
- ]
1762
-
1763
- Using the optional `length` parameter
1764
-
1765
- >>> s.str.slice(4, length=3)
1766
- shape: (4,)
1767
- Series: '' [str]
1768
- [
1769
- ""
1770
- null
1771
- "ya"
1772
- "onf"
1773
- ]
1774
- """
1775
-
1776
- def head(self, n: int | IntoExprColumn) -> Series:
1777
- """
1778
- Return the first n characters of each string in a String Series.
1779
-
1780
- Parameters
1781
- ----------
1782
- n
1783
- Length of the slice (integer or expression). Negative indexing is supported;
1784
- see note (2) below.
1785
-
1786
- Returns
1787
- -------
1788
- Series
1789
- Series of data type :class:`String`.
1790
-
1791
- Notes
1792
- -----
1793
- 1) The `n` input is defined in terms of the number of characters in the (UTF8)
1794
- string. A character is defined as a `Unicode scalar value`_. A single
1795
- character is represented by a single byte when working with ASCII text, and a
1796
- maximum of 4 bytes otherwise.
1797
-
1798
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
1799
-
1800
- 2) When `n` is negative, `head` returns characters up to the `n`th from the end
1801
- of the string. For example, if `n = -3`, then all characters except the last
1802
- three are returned.
1803
-
1804
- 3) If the length of the string has fewer than `n` characters, the full string is
1805
- returned.
1806
-
1807
- Examples
1808
- --------
1809
- Return up to the first 5 characters.
1810
-
1811
- >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
1812
- >>> s.str.head(5)
1813
- shape: (4,)
1814
- Series: '' [str]
1815
- [
1816
- "pear"
1817
- null
1818
- "papay"
1819
- "drago"
1820
- ]
1821
-
1822
- Return up to the 3rd character from the end.
1823
-
1824
- >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
1825
- >>> s.str.head(-3)
1826
- shape: (4,)
1827
- Series: '' [str]
1828
- [
1829
- "p"
1830
- null
1831
- "pap"
1832
- "dragonfr"
1833
- ]
1834
- """
1835
-
1836
- def tail(self, n: int | IntoExprColumn) -> Series:
1837
- """
1838
- Return the last n characters of each string in a String Series.
1839
-
1840
- Parameters
1841
- ----------
1842
- n
1843
- Length of the slice (integer or expression). Negative indexing is supported;
1844
- see note (2) below.
1845
-
1846
- Returns
1847
- -------
1848
- Series
1849
- Series of data type :class:`String`.
1850
-
1851
- Notes
1852
- -----
1853
- 1) The `n` input is defined in terms of the number of characters in the (UTF8)
1854
- string. A character is defined as a `Unicode scalar value`_. A single
1855
- character is represented by a single byte when working with ASCII text, and a
1856
- maximum of 4 bytes otherwise.
1857
-
1858
- .. _Unicode scalar value: https://www.unicode.org/glossary/#unicode_scalar_value
1859
-
1860
- 2) When `n` is negative, `tail` returns characters starting from the `n`th from
1861
- the beginning of the string. For example, if `n = -3`, then all characters
1862
- except the first three are returned.
1863
-
1864
- 3) If the length of the string has fewer than `n` characters, the full string is
1865
- returned.
1866
-
1867
- Examples
1868
- --------
1869
- Return up to the last 5 characters:
1870
-
1871
- >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
1872
- >>> s.str.tail(5)
1873
- shape: (4,)
1874
- Series: '' [str]
1875
- [
1876
- "pear"
1877
- null
1878
- "apaya"
1879
- "fruit"
1880
- ]
1881
-
1882
- Return from the 3rd character to the end:
1883
-
1884
- >>> s = pl.Series(["pear", None, "papaya", "dragonfruit"])
1885
- >>> s.str.tail(-3)
1886
- shape: (4,)
1887
- Series: '' [str]
1888
- [
1889
- "r"
1890
- null
1891
- "aya"
1892
- "gonfruit"
1893
- ]
1894
- """
1895
-
1896
- @deprecated(
1897
- '`Series.str.explode` is deprecated; use `Series.str.split("").explode()` instead. '
1898
- "Note that empty strings will result in null instead of being preserved. To get "
1899
- "the exact same behavior, split first and then use a `pl.when...then...otherwise` "
1900
- "expression to handle the empty list before exploding. "
1901
- )
1902
- def explode(self) -> Series:
1903
- """
1904
- Returns a column with a separate row for every string character.
1905
-
1906
- .. deprecated:: 0.20.31
1907
- Use the `.str.split("").explode()` method instead. Note that empty strings
1908
- will result in null instead of being preserved. To get the exact same
1909
- behavior, split first and then use a `pl.when...then...otherwise`
1910
- expression to handle the empty list before exploding.
1911
-
1912
- Returns
1913
- -------
1914
- Series
1915
- Series of data type :class:`String`.
1916
-
1917
- Examples
1918
- --------
1919
- >>> s = pl.Series("a", ["foo", "bar"])
1920
- >>> s.str.explode() # doctest: +SKIP
1921
- shape: (6,)
1922
- Series: 'a' [str]
1923
- [
1924
- "f"
1925
- "o"
1926
- "o"
1927
- "b"
1928
- "a"
1929
- "r"
1930
- ]
1931
- """
1932
-
1933
- def to_integer(
1934
- self,
1935
- *,
1936
- base: int | IntoExprColumn = 10,
1937
- dtype: PolarsIntegerType = Int64,
1938
- strict: bool = True,
1939
- ) -> Series:
1940
- """
1941
- Convert an String column into a column of dtype with base radix.
1942
-
1943
- Parameters
1944
- ----------
1945
- base
1946
- Positive integer or expression which is the base of the string
1947
- we are parsing.
1948
- Default: 10.
1949
- dtype
1950
- Polars integer type to cast to.
1951
- Default: :class:`Int64`.
1952
- strict
1953
- Bool, Default=True will raise any ParseError or overflow as ComputeError.
1954
- False silently convert to Null.
1955
-
1956
- Returns
1957
- -------
1958
- Series
1959
- Series of data.
1960
-
1961
- Examples
1962
- --------
1963
- >>> s = pl.Series("bin", ["110", "101", "010", "invalid"])
1964
- >>> s.str.to_integer(base=2, dtype=pl.Int32, strict=False)
1965
- shape: (4,)
1966
- Series: 'bin' [i32]
1967
- [
1968
- 6
1969
- 5
1970
- 2
1971
- null
1972
- ]
1973
-
1974
- >>> s = pl.Series("hex", ["fa1e", "ff00", "cafe", None])
1975
- >>> s.str.to_integer(base=16)
1976
- shape: (4,)
1977
- Series: 'hex' [i64]
1978
- [
1979
- 64030
1980
- 65280
1981
- 51966
1982
- null
1983
- ]
1984
- """
1985
-
1986
- def contains_any(
1987
- self, patterns: Series | list[str], *, ascii_case_insensitive: bool = False
1988
- ) -> Series:
1989
- """
1990
- Use the Aho-Corasick algorithm to find matches.
1991
-
1992
- Determines if any of the patterns are contained in the string.
1993
-
1994
- Parameters
1995
- ----------
1996
- patterns
1997
- String patterns to search.
1998
- ascii_case_insensitive
1999
- Enable ASCII-aware case-insensitive matching.
2000
- When this option is enabled, searching will be performed without respect
2001
- to case for ASCII letters (a-z and A-Z) only.
2002
-
2003
- Notes
2004
- -----
2005
- This method supports matching on string literals only, and does not support
2006
- regular expression matching.
2007
-
2008
- Examples
2009
- --------
2010
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2011
- >>> s = pl.Series(
2012
- ... "lyrics",
2013
- ... [
2014
- ... "Everybody wants to rule the world",
2015
- ... "Tell me what you want, what you really really want",
2016
- ... "Can you feel the love tonight",
2017
- ... ],
2018
- ... )
2019
- >>> s.str.contains_any(["you", "me"])
2020
- shape: (3,)
2021
- Series: 'lyrics' [bool]
2022
- [
2023
- false
2024
- true
2025
- true
2026
- ]
2027
- """
2028
-
2029
- def replace_many(
2030
- self,
2031
- patterns: Series | list[str] | Mapping[str, str],
2032
- replace_with: Series | list[str] | str | NoDefault = no_default,
2033
- *,
2034
- ascii_case_insensitive: bool = False,
2035
- ) -> Series:
2036
- """
2037
- Use the Aho-Corasick algorithm to replace many matches.
2038
-
2039
- Parameters
2040
- ----------
2041
- patterns
2042
- String patterns to search and replace.
2043
- Also accepts a mapping of patterns to their replacement as syntactic sugar
2044
- for `replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`.
2045
- replace_with
2046
- Strings to replace where a pattern was a match.
2047
- Length must match the length of `patterns` or have length 1. This can be
2048
- broadcast, so it supports many:one and many:many.
2049
- ascii_case_insensitive
2050
- Enable ASCII-aware case-insensitive matching.
2051
- When this option is enabled, searching will be performed without respect
2052
- to case for ASCII letters (a-z and A-Z) only.
2053
-
2054
- Notes
2055
- -----
2056
- This method supports matching on string literals only, and does not support
2057
- regular expression matching.
2058
-
2059
- Examples
2060
- --------
2061
- Replace many patterns by passing lists of equal length to the `patterns` and
2062
- `replace_with` parameters.
2063
-
2064
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2065
- >>> s = pl.Series(
2066
- ... "lyrics",
2067
- ... [
2068
- ... "Everybody wants to rule the world",
2069
- ... "Tell me what you want, what you really really want",
2070
- ... "Can you feel the love tonight",
2071
- ... ],
2072
- ... )
2073
- >>> s.str.replace_many(["you", "me"], ["me", "you"])
2074
- shape: (3,)
2075
- Series: 'lyrics' [str]
2076
- [
2077
- "Everybody wants to rule the world"
2078
- "Tell you what me want, what me really really want"
2079
- "Can me feel the love tonight"
2080
- ]
2081
-
2082
- Broadcast a replacement for many patterns by passing a sequence of length 1 to
2083
- the `replace_with` parameter.
2084
-
2085
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2086
- >>> s = pl.Series(
2087
- ... "lyrics",
2088
- ... [
2089
- ... "Everybody wants to rule the world",
2090
- ... "Tell me what you want, what you really really want",
2091
- ... "Can you feel the love tonight",
2092
- ... ],
2093
- ... )
2094
- >>> s.str.replace_many(["me", "you", "they"], [""])
2095
- shape: (3,)
2096
- Series: 'lyrics' [str]
2097
- [
2098
- "Everybody wants to rule the world"
2099
- "Tell what want, what really really want"
2100
- "Can feel the love tonight"
2101
- ]
2102
-
2103
- Passing a mapping with patterns and replacements is also supported as syntactic
2104
- sugar.
2105
-
2106
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2107
- >>> s = pl.Series(
2108
- ... "lyrics",
2109
- ... [
2110
- ... "Everybody wants to rule the world",
2111
- ... "Tell me what you want, what you really really want",
2112
- ... "Can you feel the love tonight",
2113
- ... ],
2114
- ... )
2115
- >>> mapping = {"me": "you", "you": "me", "want": "need"}
2116
- >>> s.str.replace_many(mapping)
2117
- shape: (3,)
2118
- Series: 'lyrics' [str]
2119
- [
2120
- "Everybody needs to rule the world"
2121
- "Tell you what me need, what me really really need"
2122
- "Can me feel the love tonight"
2123
- ]
2124
- """
2125
-
2126
- @unstable()
2127
- def extract_many(
2128
- self,
2129
- patterns: Series | list[str],
2130
- *,
2131
- ascii_case_insensitive: bool = False,
2132
- overlapping: bool = False,
2133
- ) -> Series:
2134
- """
2135
- Use the Aho-Corasick algorithm to extract many matches.
2136
-
2137
- Parameters
2138
- ----------
2139
- patterns
2140
- String patterns to search.
2141
- ascii_case_insensitive
2142
- Enable ASCII-aware case-insensitive matching.
2143
- When this option is enabled, searching will be performed without respect
2144
- to case for ASCII letters (a-z and A-Z) only.
2145
- overlapping
2146
- Whether matches may overlap.
2147
-
2148
- Notes
2149
- -----
2150
- This method supports matching on string literals only, and does not support
2151
- regular expression matching.
2152
-
2153
- Examples
2154
- --------
2155
- >>> s = pl.Series("values", ["discontent"])
2156
- >>> patterns = ["winter", "disco", "onte", "discontent"]
2157
- >>> s.str.extract_many(patterns, overlapping=True)
2158
- shape: (1,)
2159
- Series: 'values' [list[str]]
2160
- [
2161
- ["disco", "onte", "discontent"]
2162
- ]
2163
-
2164
- """
2165
-
2166
- @unstable()
2167
- def find_many(
2168
- self,
2169
- patterns: IntoExpr,
2170
- *,
2171
- ascii_case_insensitive: bool = False,
2172
- overlapping: bool = False,
2173
- ) -> Series:
2174
- """
2175
- Use the Aho-Corasick algorithm to find all matches.
2176
-
2177
- The function returns the byte offset of the start of each match.
2178
- The return type will be `List<UInt32>`.
2179
-
2180
- Parameters
2181
- ----------
2182
- patterns
2183
- String patterns to search.
2184
- ascii_case_insensitive
2185
- Enable ASCII-aware case-insensitive matching.
2186
- When this option is enabled, searching will be performed without respect
2187
- to case for ASCII letters (a-z and A-Z) only.
2188
- overlapping
2189
- Whether matches may overlap.
2190
-
2191
- Notes
2192
- -----
2193
- This method supports matching on string literals only, and does not support
2194
- regular expression matching.
2195
-
2196
- Examples
2197
- --------
2198
- >>> _ = pl.Config.set_fmt_str_lengths(100)
2199
- >>> df = pl.DataFrame({"values": ["discontent"]})
2200
- >>> patterns = ["winter", "disco", "onte", "discontent"]
2201
- >>> df.with_columns(
2202
- ... pl.col("values")
2203
- ... .str.extract_many(patterns, overlapping=False)
2204
- ... .alias("matches"),
2205
- ... pl.col("values")
2206
- ... .str.extract_many(patterns, overlapping=True)
2207
- ... .alias("matches_overlapping"),
2208
- ... )
2209
- shape: (1, 3)
2210
- ┌────────────┬───────────┬─────────────────────────────────┐
2211
- │ values ┆ matches ┆ matches_overlapping │
2212
- │ --- ┆ --- ┆ --- │
2213
- │ str ┆ list[str] ┆ list[str] │
2214
- ╞════════════╪═══════════╪═════════════════════════════════╡
2215
- │ discontent ┆ ["disco"] ┆ ["disco", "onte", "discontent"] │
2216
- └────────────┴───────────┴─────────────────────────────────┘
2217
- >>> df = pl.DataFrame(
2218
- ... {
2219
- ... "values": ["discontent", "rhapsody"],
2220
- ... "patterns": [
2221
- ... ["winter", "disco", "onte", "discontent"],
2222
- ... ["rhap", "ody", "coalesce"],
2223
- ... ],
2224
- ... }
2225
- ... )
2226
- >>> df.select(pl.col("values").str.find_many("patterns"))
2227
- shape: (2, 1)
2228
- ┌───────────┐
2229
- │ values │
2230
- │ --- │
2231
- │ list[u32] │
2232
- ╞═══════════╡
2233
- │ [0] │
2234
- │ [0, 5] │
2235
- └───────────┘
2236
- """
2237
-
2238
- def join(self, delimiter: str = "", *, ignore_nulls: bool = True) -> Series:
2239
- """
2240
- Vertically concatenate the string values in the column to a single string value.
2241
-
2242
- Parameters
2243
- ----------
2244
- delimiter
2245
- The delimiter to insert between consecutive string values.
2246
- ignore_nulls
2247
- Ignore null values (default).
2248
- If set to `False`, null values will be propagated. This means that
2249
- if the column contains any null values, the output is null.
2250
-
2251
- Returns
2252
- -------
2253
- Series
2254
- Series of data type :class:`String`.
2255
-
2256
- Examples
2257
- --------
2258
- >>> s = pl.Series([1, None, 3])
2259
- >>> s.str.join("-")
2260
- shape: (1,)
2261
- Series: '' [str]
2262
- [
2263
- "1-3"
2264
- ]
2265
- >>> s.str.join(ignore_nulls=False)
2266
- shape: (1,)
2267
- Series: '' [str]
2268
- [
2269
- null
2270
- ]
2271
- """
2272
-
2273
- @deprecated(
2274
- "`Series.str.concat` is deprecated; use `Series.str.join` instead. Note also "
2275
- "that the default `delimiter` for `str.join` is an empty string, not a hyphen."
2276
- )
2277
- def concat(
2278
- self, delimiter: str | None = None, *, ignore_nulls: bool = True
2279
- ) -> Series:
2280
- """
2281
- Vertically concatenate the string values in the column to a single string value.
2282
-
2283
- .. deprecated:: 1.0.0
2284
- Use :meth:`join` instead. Note that the default `delimiter` for :meth:`join`
2285
- is an empty string instead of a hyphen.
2286
-
2287
- Parameters
2288
- ----------
2289
- delimiter
2290
- The delimiter to insert between consecutive string values.
2291
- ignore_nulls
2292
- Ignore null values (default).
2293
- If set to `False`, null values will be propagated. This means that
2294
- if the column contains any null values, the output is null.
2295
-
2296
- Returns
2297
- -------
2298
- Series
2299
- Series of data type :class:`String`.
2300
-
2301
- Examples
2302
- --------
2303
- >>> pl.Series([1, None, 2]).str.concat("-") # doctest: +SKIP
2304
- shape: (1,)
2305
- Series: '' [str]
2306
- [
2307
- "1-2"
2308
- ]
2309
- >>> pl.Series([1, None, 2]).str.concat(ignore_nulls=False) # doctest: +SKIP
2310
- shape: (1,)
2311
- Series: '' [str]
2312
- [
2313
- null
2314
- ]
2315
- """
2316
-
2317
- def escape_regex(self) -> Series:
2318
- r"""
2319
- Returns string values with all regular expression meta characters escaped.
2320
-
2321
- Returns
2322
- -------
2323
- Series
2324
- Series of data type :class:`String`.
2325
-
2326
- Examples
2327
- --------
2328
- >>> pl.Series(["abc", "def", None, "abc(\\w+)"]).str.escape_regex()
2329
- shape: (4,)
2330
- Series: '' [str]
2331
- [
2332
- "abc"
2333
- "def"
2334
- null
2335
- "abc\(\\w\+\)"
2336
- ]
2337
- """
2338
-
2339
- def normalize(self, form: UnicodeForm = "NFC") -> Series:
2340
- """
2341
- Returns the Unicode normal form of the string values.
2342
-
2343
- This uses the forms described in Unicode Standard Annex 15: <https://www.unicode.org/reports/tr15/>.
2344
-
2345
- Parameters
2346
- ----------
2347
- form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2348
- Unicode form to use.
2349
-
2350
- Examples
2351
- --------
2352
- >>> s = pl.Series(["01²", "KADOKAWA"])
2353
- >>> s.str.normalize("NFC")
2354
- shape: (2,)
2355
- Series: '' [str]
2356
- [
2357
- "01²"
2358
- "KADOKAWA"
2359
- ]
2360
- >>> s.str.normalize("NFKC")
2361
- shape: (2,)
2362
- Series: '' [str]
2363
- [
2364
- "012"
2365
- "KADOKAWA"
2366
- ]
2367
- """ # noqa: RUF002