polars-df 0.13.0-x64-mingw-ucrt
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39278 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
@@ -0,0 +1,1495 @@
|
|
1
|
+
module Polars
|
2
|
+
# Namespace for string related expressions.
|
3
|
+
class StringExpr
|
4
|
+
# @private
|
5
|
+
attr_accessor :_rbexpr
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def initialize(expr)
  # Keep a reference to the handle held by the wrapped expression; every
  # method in this namespace reads it back through `_rbexpr`.
  @_rbexpr = expr._rbexpr
end
|
11
|
+
|
12
|
+
# Convert a Utf8 column into a Date column.
|
13
|
+
#
|
14
|
+
# @param format [String]
|
15
|
+
# Format to use for conversion. Refer to the
|
16
|
+
# [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
17
|
+
# for the full specification. Example: `"%Y-%m-%d"`.
|
18
|
+
# If set to nil (default), the format is inferred from the data.
|
19
|
+
# @param strict [Boolean]
|
20
|
+
# Raise an error if any conversion fails.
|
21
|
+
# @param exact [Boolean]
|
22
|
+
# Require an exact format match. If false, allow the format to match anywhere
|
23
|
+
# in the target string.
|
24
|
+
# @param cache [Boolean]
|
25
|
+
# Use a cache of unique, converted dates to apply the conversion.
|
26
|
+
#
|
27
|
+
# @return [Expr]
|
28
|
+
#
|
29
|
+
# @example
|
30
|
+
# s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
|
31
|
+
# s.str.to_date
|
32
|
+
# # =>
|
33
|
+
# # shape: (3,)
|
34
|
+
# # Series: '' [date]
|
35
|
+
# # [
|
36
|
+
# # 2020-01-01
|
37
|
+
# # 2020-02-01
|
38
|
+
# # 2020-03-01
|
39
|
+
# # ]
|
40
|
+
def to_date(format = nil, strict: true, exact: true, cache: true)
  # Check `format` first (the validator is defined outside this section).
  _validate_format_argument(format)

  # Arguments are forwarded positionally to the underlying expression.
  parsed = _rbexpr.str_to_date(format, strict, exact, cache)
  Utils.wrap_expr(parsed)
end
|
44
|
+
|
45
|
+
# Convert a Utf8 column into a Datetime column.
|
46
|
+
#
|
47
|
+
# @param format [String]
|
48
|
+
# Format to use for conversion. Refer to the
|
49
|
+
# [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
50
|
+
# for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
|
51
|
+
# If set to nil (default), the format is inferred from the data.
|
52
|
+
# @param time_unit ["us", "ns", "ms"]
|
53
|
+
# Unit of time for the resulting Datetime column. If set to nil (default),
|
54
|
+
# the time unit is inferred from the format string if given, eg:
|
55
|
+
# `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
|
56
|
+
# found, the default is `"us"`.
|
57
|
+
# @param time_zone [String]
|
58
|
+
# Time zone for the resulting Datetime column.
|
59
|
+
# @param strict [Boolean]
|
60
|
+
# Raise an error if any conversion fails.
|
61
|
+
# @param exact [Boolean]
|
62
|
+
# Require an exact format match. If false, allow the format to match anywhere
|
63
|
+
# in the target string.
|
64
|
+
# @param cache [Boolean]
|
65
|
+
# Use a cache of unique, converted datetimes to apply the conversion.
|
66
|
+
#
|
67
|
+
# @return [Expr]
|
68
|
+
#
|
69
|
+
# @example
|
70
|
+
# s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
71
|
+
# s.str.to_datetime("%Y-%m-%d %H:%M%#z")
|
72
|
+
# # =>
|
73
|
+
# # shape: (2,)
|
74
|
+
# # Series: '' [datetime[μs, UTC]]
|
75
|
+
# # [
|
76
|
+
# # 2020-01-01 01:00:00 UTC
|
77
|
+
# # 2020-01-01 02:00:00 UTC
|
78
|
+
# # ]
|
79
|
+
def to_datetime(
  format = nil,
  time_unit: nil,
  time_zone: nil,
  strict: true,
  exact: true,
  cache: true,
  ambiguous: "raise"
)
  _validate_format_argument(format)

  # `ambiguous` may be a plain value or an Expr. Plain values are turned into
  # a literal expression so that `_rbexpr` can be read from it below.
  ambiguous = Polars.lit(ambiguous) unless ambiguous.is_a?(Expr)

  parsed = _rbexpr.str_to_datetime(
    format, time_unit, time_zone, strict, exact, cache, ambiguous._rbexpr
  )
  Utils.wrap_expr(parsed)
end
|
104
|
+
|
105
|
+
# Convert a Utf8 column into a Time column.
|
106
|
+
#
|
107
|
+
# @param format [String]
|
108
|
+
# Format to use for conversion. Refer to the
|
109
|
+
# [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
110
|
+
# for the full specification. Example: `"%H:%M:%S"`.
|
111
|
+
# If set to nil (default), the format is inferred from the data.
|
112
|
+
# @param strict [Boolean]
|
113
|
+
# Raise an error if any conversion fails.
|
114
|
+
# @param cache [Boolean]
|
115
|
+
# Use a cache of unique, converted times to apply the conversion.
|
116
|
+
#
|
117
|
+
# @return [Expr]
|
118
|
+
#
|
119
|
+
# @example
|
120
|
+
# s = Polars::Series.new(["01:00", "02:00", "03:00"])
|
121
|
+
# s.str.to_time("%H:%M")
|
122
|
+
# # =>
|
123
|
+
# # shape: (3,)
|
124
|
+
# # Series: '' [time]
|
125
|
+
# # [
|
126
|
+
# # 01:00:00
|
127
|
+
# # 02:00:00
|
128
|
+
# # 03:00:00
|
129
|
+
# # ]
|
130
|
+
def to_time(format = nil, strict: true, cache: true)
  # Check `format` first (the validator is defined outside this section).
  _validate_format_argument(format)

  parsed = _rbexpr.str_to_time(format, strict, cache)
  Utils.wrap_expr(parsed)
end
|
134
|
+
|
135
|
+
# Parse a Utf8 expression to a Date/Datetime/Time type.
|
136
|
+
#
|
137
|
+
# @param dtype [Object]
|
138
|
+
# The data type to convert into. Can be either Date, Datetime, or Time.
|
139
|
+
# @param format [String]
|
140
|
+
# Format to use, refer to the
|
141
|
+
# [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
142
|
+
# for specification. Example: `"%y-%m-%d"`.
|
143
|
+
# @param strict [Boolean]
|
144
|
+
# Raise an error if any conversion fails.
|
145
|
+
# @param exact [Boolean]
|
146
|
+
# - If true, require an exact format match.
|
147
|
+
# - If false, allow the format to match anywhere in the target string.
|
148
|
+
# @param utc [Boolean]
|
149
|
+
# Parse timezone aware datetimes as UTC. This may be useful if you have data
|
150
|
+
# with mixed offsets.
|
151
|
+
#
|
152
|
+
# @return [Expr]
|
153
|
+
#
|
154
|
+
# @note
|
155
|
+
# When parsing a Datetime the column precision will be inferred from
|
156
|
+
# the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
|
157
|
+
# no fractional second component is found then the default is "us".
|
158
|
+
#
|
159
|
+
# @example Dealing with a consistent format:
|
160
|
+
# s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
161
|
+
# s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
|
162
|
+
# # =>
|
163
|
+
# # shape: (2,)
|
164
|
+
# # Series: '' [datetime[μs, UTC]]
|
165
|
+
# # [
|
166
|
+
# # 2020-01-01 01:00:00 UTC
|
167
|
+
# # 2020-01-01 02:00:00 UTC
|
168
|
+
# # ]
|
169
|
+
#
|
170
|
+
# @example Dealing with different formats.
|
171
|
+
# s = Polars::Series.new(
|
172
|
+
# "date",
|
173
|
+
# [
|
174
|
+
# "2021-04-22",
|
175
|
+
# "2022-01-04 00:00:00",
|
176
|
+
# "01/31/22",
|
177
|
+
# "Sun Jul 8 00:34:60 2001",
|
178
|
+
# ]
|
179
|
+
# )
|
180
|
+
# s.to_frame.select(
|
181
|
+
# Polars.coalesce(
|
182
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
|
183
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
|
184
|
+
# Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
|
185
|
+
# Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
|
186
|
+
# )
|
187
|
+
# ).to_series
|
188
|
+
# # =>
|
189
|
+
# # shape: (4,)
|
190
|
+
# # Series: 'date' [date]
|
191
|
+
# # [
|
192
|
+
# # 2021-04-22
|
193
|
+
# # 2022-01-04
|
194
|
+
# # 2022-01-31
|
195
|
+
# # 2001-07-08
|
196
|
+
# # ]
|
197
|
+
def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
  # NOTE(review): `utc` is accepted but not forwarded to any of the
  # conversions below - confirm whether it is meant to be a no-op.
  _validate_format_argument(format)

  # Dispatch on the requested dtype, in the order Date, Datetime, Time.
  return to_date(format, strict: strict, exact: exact, cache: cache) if dtype == Date

  if dtype == Datetime || dtype.is_a?(Datetime)
    # The bare Datetime class is replaced by a default instance so that the
    # time unit and time zone can be read from it.
    dtype = Datetime.new if dtype == Datetime
    return to_datetime(
      format,
      time_unit: dtype.time_unit,
      time_zone: dtype.time_zone,
      strict: strict,
      exact: exact,
      cache: cache
    )
  end

  return to_time(format, strict: strict, cache: cache) if dtype == Time

  raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
end
|
213
|
+
|
214
|
+
# Convert a String column into a Decimal column.
|
215
|
+
#
|
216
|
+
# This method infers the needed parameters `precision` and `scale`.
|
217
|
+
#
|
218
|
+
# @param inference_length [Integer]
|
219
|
+
# Number of elements to parse to determine the `precision` and `scale`.
|
220
|
+
#
|
221
|
+
# @return [Expr]
|
222
|
+
#
|
223
|
+
# @example
|
224
|
+
# df = Polars::DataFrame.new(
|
225
|
+
# {
|
226
|
+
# "numbers": [
|
227
|
+
# "40.12",
|
228
|
+
# "3420.13",
|
229
|
+
# "120134.19",
|
230
|
+
# "3212.98",
|
231
|
+
# "12.90",
|
232
|
+
# "143.09",
|
233
|
+
# "143.9"
|
234
|
+
# ]
|
235
|
+
# }
|
236
|
+
# )
|
237
|
+
# df.with_columns(numbers_decimal: Polars.col("numbers").str.to_decimal)
|
238
|
+
# # =>
|
239
|
+
# # shape: (7, 2)
|
240
|
+
# # ┌───────────┬─────────────────┐
|
241
|
+
# # │ numbers ┆ numbers_decimal │
|
242
|
+
# # │ --- ┆ --- │
|
243
|
+
# # │ str ┆ decimal[*,2] │
|
244
|
+
# # ╞═══════════╪═════════════════╡
|
245
|
+
# # │ 40.12 ┆ 40.12 │
|
246
|
+
# # │ 3420.13 ┆ 3420.13 │
|
247
|
+
# # │ 120134.19 ┆ 120134.19 │
|
248
|
+
# # │ 3212.98 ┆ 3212.98 │
|
249
|
+
# # │ 12.90 ┆ 12.90 │
|
250
|
+
# # │ 143.09 ┆ 143.09 │
|
251
|
+
# # │ 143.9 ┆ 143.90 │
|
252
|
+
# # └───────────┴─────────────────┘
|
253
|
+
def to_decimal(inference_length = 100)
  # Forward the inference window to the underlying expression and wrap the result.
  converted = _rbexpr.str_to_decimal(inference_length)
  Utils.wrap_expr(converted)
end
|
256
|
+
|
257
|
+
# Get length of the strings as `:u32` (as number of bytes).
|
258
|
+
#
|
259
|
+
# @return [Expr]
|
260
|
+
#
|
261
|
+
# @note
|
262
|
+
# The returned lengths are equal to the number of bytes in the UTF8 string. If you
|
263
|
+
# need the length in terms of the number of characters, use `n_chars` instead.
|
264
|
+
#
|
265
|
+
# @example
|
266
|
+
# df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
|
267
|
+
# [
|
268
|
+
# Polars.col("s").str.len_bytes.alias("length"),
|
269
|
+
# Polars.col("s").str.len_chars.alias("nchars")
|
270
|
+
# ]
|
271
|
+
# )
|
272
|
+
# df
|
273
|
+
# # =>
|
274
|
+
# # shape: (4, 3)
|
275
|
+
# # ┌──────┬────────┬────────┐
|
276
|
+
# # │ s ┆ length ┆ nchars │
|
277
|
+
# # │ --- ┆ --- ┆ --- │
|
278
|
+
# # │ str ┆ u32 ┆ u32 │
|
279
|
+
# # ╞══════╪════════╪════════╡
|
280
|
+
# # │ Café ┆ 5 ┆ 4 │
|
281
|
+
# # │ null ┆ null ┆ null │
|
282
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
283
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
284
|
+
# # └──────┴────────┴────────┘
|
285
|
+
def len_bytes
  # Delegate to `str_len_bytes` on the underlying expression and wrap the result.
  byte_lengths = _rbexpr.str_len_bytes
  Utils.wrap_expr(byte_lengths)
end
|
288
|
+
alias_method :lengths, :len_bytes
|
289
|
+
|
290
|
+
# Get length of the strings as `:u32` (as number of chars).
|
291
|
+
#
|
292
|
+
# @return [Expr]
|
293
|
+
#
|
294
|
+
# @note
|
295
|
+
# If you know that you are working with ASCII text, `lengths` will be
|
296
|
+
# equivalent, and faster (returns length in terms of the number of bytes).
|
297
|
+
#
|
298
|
+
# @example
|
299
|
+
# df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
|
300
|
+
# [
|
301
|
+
# Polars.col("s").str.len_bytes.alias("length"),
|
302
|
+
# Polars.col("s").str.len_chars.alias("nchars")
|
303
|
+
# ]
|
304
|
+
# )
|
305
|
+
# df
|
306
|
+
# # =>
|
307
|
+
# # shape: (4, 3)
|
308
|
+
# # ┌──────┬────────┬────────┐
|
309
|
+
# # │ s ┆ length ┆ nchars │
|
310
|
+
# # │ --- ┆ --- ┆ --- │
|
311
|
+
# # │ str ┆ u32 ┆ u32 │
|
312
|
+
# # ╞══════╪════════╪════════╡
|
313
|
+
# # │ Café ┆ 5 ┆ 4 │
|
314
|
+
# # │ null ┆ null ┆ null │
|
315
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
316
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
317
|
+
# # └──────┴────────┴────────┘
|
318
|
+
def len_chars
  # Delegate to `str_len_chars` on the underlying expression and wrap the result.
  char_lengths = _rbexpr.str_len_chars
  Utils.wrap_expr(char_lengths)
end
|
321
|
+
alias_method :n_chars, :len_chars
|
322
|
+
|
323
|
+
# Vertically concat the values in the Series to a single string value.
|
324
|
+
#
|
325
|
+
# @param delimiter [String]
|
326
|
+
# The delimiter to insert between consecutive string values.
|
327
|
+
# @param ignore_nulls [Boolean]
|
328
|
+
# Ignore null values (default).
|
329
|
+
#
|
330
|
+
# @return [Expr]
|
331
|
+
#
|
332
|
+
# @example
|
333
|
+
# df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
|
334
|
+
# df.select(Polars.col("foo").str.join("-"))
|
335
|
+
# # =>
|
336
|
+
# # shape: (1, 1)
|
337
|
+
# # ┌─────┐
|
338
|
+
# # │ foo │
|
339
|
+
# # │ --- │
|
340
|
+
# # │ str │
|
341
|
+
# # ╞═════╡
|
342
|
+
# # │ 1-2 │
|
343
|
+
# # └─────┘
|
344
|
+
#
|
345
|
+
# @example
|
346
|
+
# df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
|
347
|
+
# df.select(Polars.col("foo").str.join("-", ignore_nulls: false))
|
348
|
+
# # =>
|
349
|
+
# # shape: (1, 1)
|
350
|
+
# # ┌──────┐
|
351
|
+
# # │ foo │
|
352
|
+
# # │ --- │
|
353
|
+
# # │ str │
|
354
|
+
# # ╞══════╡
|
355
|
+
# # │ null │
|
356
|
+
# # └──────┘
|
357
|
+
def join(delimiter = "-", ignore_nulls: true)
  # Both options are forwarded positionally: delimiter first, then the null flag.
  joined = _rbexpr.str_join(delimiter, ignore_nulls)
  Utils.wrap_expr(joined)
end
|
360
|
+
alias_method :concat, :join
|
361
|
+
|
362
|
+
# Transform to uppercase variant.
|
363
|
+
#
|
364
|
+
# @return [Expr]
|
365
|
+
#
|
366
|
+
# @example
|
367
|
+
# df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
|
368
|
+
# df.select(Polars.col("foo").str.to_uppercase)
|
369
|
+
# # =>
|
370
|
+
# # shape: (2, 1)
|
371
|
+
# # ┌─────┐
|
372
|
+
# # │ foo │
|
373
|
+
# # │ --- │
|
374
|
+
# # │ str │
|
375
|
+
# # ╞═════╡
|
376
|
+
# # │ CAT │
|
377
|
+
# # │ DOG │
|
378
|
+
# # └─────┘
|
379
|
+
def to_uppercase
  # Delegate to `str_to_uppercase` on the underlying expression and wrap the result.
  upper = _rbexpr.str_to_uppercase
  Utils.wrap_expr(upper)
end
|
382
|
+
|
383
|
+
# Transform to lowercase variant.
|
384
|
+
#
|
385
|
+
# @return [Expr]
|
386
|
+
#
|
387
|
+
# @example
|
388
|
+
# df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
|
389
|
+
# df.select(Polars.col("foo").str.to_lowercase)
|
390
|
+
# # =>
|
391
|
+
# # shape: (2, 1)
|
392
|
+
# # ┌─────┐
|
393
|
+
# # │ foo │
|
394
|
+
# # │ --- │
|
395
|
+
# # │ str │
|
396
|
+
# # ╞═════╡
|
397
|
+
# # │ cat │
|
398
|
+
# # │ dog │
|
399
|
+
# # └─────┘
|
400
|
+
def to_lowercase
  # Delegate to `str_to_lowercase` on the underlying expression and wrap the result.
  lower = _rbexpr.str_to_lowercase
  Utils.wrap_expr(lower)
end
|
403
|
+
|
404
|
+
# Transform to titlecase variant.
|
405
|
+
#
|
406
|
+
# @return [Expr]
|
407
|
+
#
|
408
|
+
# @example
|
409
|
+
# df = Polars::DataFrame.new(
|
410
|
+
# {"sing": ["welcome to my world", "THERE'S NO TURNING BACK"]}
|
411
|
+
# )
|
412
|
+
# df.with_columns(foo_title: Polars.col("sing").str.to_titlecase)
|
413
|
+
# # =>
|
414
|
+
# # shape: (2, 2)
|
415
|
+
# # ┌─────────────────────────┬─────────────────────────┐
|
416
|
+
# # │ sing ┆ foo_title │
|
417
|
+
# # │ --- ┆ --- │
|
418
|
+
# # │ str ┆ str │
|
419
|
+
# # ╞═════════════════════════╪═════════════════════════╡
|
420
|
+
# # │ welcome to my world ┆ Welcome To My World │
|
421
|
+
# # │ THERE'S NO TURNING BACK ┆ There's No Turning Back │
|
422
|
+
# # └─────────────────────────┴─────────────────────────┘
|
423
|
+
def to_titlecase
  # Not implemented: this method always raises `Todo`, so callers never get
  # an expression back.
  #
  # TODO: replace the raise with `Utils.wrap_expr(_rbexpr.str_to_titlecase)`
  # once title-casing is supported.
  # NOTE(review): confirm that `str_to_titlecase` exists on `_rbexpr` first.
  raise Todo
end
|
427
|
+
|
428
|
+
# Remove leading and trailing whitespace.
|
429
|
+
#
|
430
|
+
# @param characters [String, nil]
|
431
|
+
# An optional single character that should be trimmed.
|
432
|
+
#
|
433
|
+
# @return [Expr]
|
434
|
+
#
|
435
|
+
# @example
|
436
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
437
|
+
# df.select(Polars.col("foo").str.strip)
|
438
|
+
# # =>
|
439
|
+
# # shape: (3, 1)
|
440
|
+
# # ┌───────┐
|
441
|
+
# # │ foo │
|
442
|
+
# # │ --- │
|
443
|
+
# # │ str │
|
444
|
+
# # ╞═══════╡
|
445
|
+
# # │ lead │
|
446
|
+
# # │ trail │
|
447
|
+
# # │ both │
|
448
|
+
# # └───────┘
|
449
|
+
def strip_chars(characters = nil)
  # `str_as_lit: true` - presumably so a String argument is read as a literal
  # value rather than a column name; confirm against Utils.parse_into_expression.
  chars_expr = Utils.parse_into_expression(characters, str_as_lit: true)
  stripped = _rbexpr.str_strip_chars(chars_expr)
  Utils.wrap_expr(stripped)
end
|
453
|
+
alias_method :strip, :strip_chars
|
454
|
+
|
455
|
+
# Remove leading whitespace.
|
456
|
+
#
|
457
|
+
# @param characters [String, nil]
|
458
|
+
# An optional single character that should be trimmed.
|
459
|
+
#
|
460
|
+
# @return [Expr]
|
461
|
+
#
|
462
|
+
# @example
|
463
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
464
|
+
# df.select(Polars.col("foo").str.lstrip)
|
465
|
+
# # =>
|
466
|
+
# # shape: (3, 1)
|
467
|
+
# # ┌────────┐
|
468
|
+
# # │ foo │
|
469
|
+
# # │ --- │
|
470
|
+
# # │ str │
|
471
|
+
# # ╞════════╡
|
472
|
+
# # │ lead │
|
473
|
+
# # │ trail │
|
474
|
+
# # │ both │
|
475
|
+
# # └────────┘
|
476
|
+
def strip_chars_start(characters = nil)
  # Convert the argument into an expression before handing it to `_rbexpr`.
  chars_expr = Utils.parse_into_expression(characters, str_as_lit: true)
  stripped = _rbexpr.str_strip_chars_start(chars_expr)
  Utils.wrap_expr(stripped)
end
|
480
|
+
alias_method :lstrip, :strip_chars_start
|
481
|
+
|
482
|
+
# Remove trailing whitespace.
|
483
|
+
#
|
484
|
+
# @param characters [String, nil]
|
485
|
+
# An optional single character that should be trimmed.
|
486
|
+
#
|
487
|
+
# @return [Expr]
|
488
|
+
#
|
489
|
+
# @example
|
490
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
491
|
+
# df.select(Polars.col("foo").str.rstrip)
|
492
|
+
# # =>
|
493
|
+
# # shape: (3, 1)
|
494
|
+
# # ┌───────┐
|
495
|
+
# # │ foo │
|
496
|
+
# # │ --- │
|
497
|
+
# # │ str │
|
498
|
+
# # ╞═══════╡
|
499
|
+
# # │ lead │
|
500
|
+
# # │ trail │
|
501
|
+
# # │ both │
|
502
|
+
# # └───────┘
|
503
|
+
def strip_chars_end(characters = nil)
  # Convert the argument into an expression before handing it to `_rbexpr`.
  chars_expr = Utils.parse_into_expression(characters, str_as_lit: true)
  stripped = _rbexpr.str_strip_chars_end(chars_expr)
  Utils.wrap_expr(stripped)
end
|
507
|
+
alias_method :rstrip, :strip_chars_end
|
508
|
+
|
509
|
+
# Remove prefix.
|
510
|
+
#
|
511
|
+
# The prefix will be removed from the string exactly once, if found.
|
512
|
+
#
|
513
|
+
# @param prefix [String]
|
514
|
+
# The prefix to be removed.
|
515
|
+
#
|
516
|
+
# @return [Expr]
|
517
|
+
#
|
518
|
+
# @example
|
519
|
+
# df = Polars::DataFrame.new({"a" => ["foobar", "foofoobar", "foo", "bar"]})
|
520
|
+
# df.with_columns(Polars.col("a").str.strip_prefix("foo").alias("stripped"))
|
521
|
+
# # =>
|
522
|
+
# # shape: (4, 2)
|
523
|
+
# # ┌───────────┬──────────┐
|
524
|
+
# # │ a ┆ stripped │
|
525
|
+
# # │ --- ┆ --- │
|
526
|
+
# # │ str ┆ str │
|
527
|
+
# # ╞═══════════╪══════════╡
|
528
|
+
# # │ foobar ┆ bar │
|
529
|
+
# # │ foofoobar ┆ foobar │
|
530
|
+
# # │ foo ┆ │
|
531
|
+
# # │ bar ┆ bar │
|
532
|
+
# # └───────────┴──────────┘
|
533
|
+
def strip_prefix(prefix)
  # The prefix is parsed into an expression (with `str_as_lit: true`) first.
  prefix_expr = Utils.parse_into_expression(prefix, str_as_lit: true)
  without_prefix = _rbexpr.str_strip_prefix(prefix_expr)
  Utils.wrap_expr(without_prefix)
end
|
537
|
+
|
538
|
+
# Remove suffix.
|
539
|
+
#
|
540
|
+
# The suffix will be removed from the string exactly once, if found.
|
541
|
+
#
|
542
|
+
#
|
543
|
+
# @param suffix [String]
|
544
|
+
# The suffix to be removed.
|
545
|
+
#
|
546
|
+
# @return [Expr]
|
547
|
+
#
|
548
|
+
# @example
|
549
|
+
# df = Polars::DataFrame.new({"a" => ["foobar", "foobarbar", "foo", "bar"]})
|
550
|
+
# df.with_columns(Polars.col("a").str.strip_suffix("bar").alias("stripped"))
|
551
|
+
# # =>
|
552
|
+
# # shape: (4, 2)
|
553
|
+
# # ┌───────────┬──────────┐
|
554
|
+
# # │ a ┆ stripped │
|
555
|
+
# # │ --- ┆ --- │
|
556
|
+
# # │ str ┆ str │
|
557
|
+
# # ╞═══════════╪══════════╡
|
558
|
+
# # │ foobar ┆ foo │
|
559
|
+
# # │ foobarbar ┆ foobar │
|
560
|
+
# # │ foo ┆ foo │
|
561
|
+
# # │ bar ┆ │
|
562
|
+
# # └───────────┴──────────┘
|
563
|
+
def strip_suffix(suffix)
  # The suffix is parsed into an expression (with `str_as_lit: true`) first.
  suffix_expr = Utils.parse_into_expression(suffix, str_as_lit: true)
  without_suffix = _rbexpr.str_strip_suffix(suffix_expr)
  Utils.wrap_expr(without_suffix)
end
|
567
|
+
|
568
|
+
# Pad the start of the string until it reaches the given length.
|
569
|
+
#
|
570
|
+
# @param length [Integer]
|
571
|
+
# Pad the string until it reaches this length. Strings with length equal to
|
572
|
+
# or greater than this value are returned as-is.
|
573
|
+
# @param fill_char [String]
|
574
|
+
# The character to pad the string with.
|
575
|
+
#
|
576
|
+
# @return [Expr]
|
577
|
+
#
|
578
|
+
# @example
|
579
|
+
# df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
|
580
|
+
# df.with_columns(padded: Polars.col("a").str.pad_start(8, "*"))
|
581
|
+
# # =>
|
582
|
+
# # shape: (4, 2)
|
583
|
+
# # ┌──────────────┬──────────────┐
|
584
|
+
# # │ a ┆ padded │
|
585
|
+
# # │ --- ┆ --- │
|
586
|
+
# # │ str ┆ str │
|
587
|
+
# # ╞══════════════╪══════════════╡
|
588
|
+
# # │ cow ┆ *****cow │
|
589
|
+
# # │ monkey ┆ **monkey │
|
590
|
+
# # │ hippopotamus ┆ hippopotamus │
|
591
|
+
# # │ null ┆ null │
|
592
|
+
# # └──────────────┴──────────────┘
|
593
|
+
def pad_start(length, fill_char = " ")
  # `length` and `fill_char` are passed through as given (no expression parsing).
  padded = _rbexpr.str_pad_start(length, fill_char)
  Utils.wrap_expr(padded)
end
|
596
|
+
alias_method :rjust, :pad_start
|
597
|
+
|
598
|
+
# Pad the end of the string until it reaches the given length.
|
599
|
+
#
|
600
|
+
# @param length [Integer]
|
601
|
+
# Pad the string until it reaches this length. Strings with length equal to
|
602
|
+
# or greater than this value are returned as-is.
|
603
|
+
# @param fill_char [String]
|
604
|
+
# The character to pad the string with.
|
605
|
+
#
|
606
|
+
# @return [Expr]
|
607
|
+
#
|
608
|
+
# @example
|
609
|
+
# df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
|
610
|
+
# df.with_columns(padded: Polars.col("a").str.pad_end(8, "*"))
|
611
|
+
# # =>
|
612
|
+
# # shape: (4, 2)
|
613
|
+
# # ┌──────────────┬──────────────┐
|
614
|
+
# # │ a ┆ padded │
|
615
|
+
# # │ --- ┆ --- │
|
616
|
+
# # │ str ┆ str │
|
617
|
+
# # ╞══════════════╪══════════════╡
|
618
|
+
# # │ cow ┆ cow***** │
|
619
|
+
# # │ monkey ┆ monkey** │
|
620
|
+
# # │ hippopotamus ┆ hippopotamus │
|
621
|
+
# # │ null ┆ null │
|
622
|
+
# # └──────────────┴──────────────┘
|
623
|
+
def pad_end(length, fill_char = " ")
  # `length` and `fill_char` are passed through as given (no expression parsing).
  padded = _rbexpr.str_pad_end(length, fill_char)
  Utils.wrap_expr(padded)
end
|
626
|
+
alias_method :ljust, :pad_end
|
627
|
+
|
628
|
+
# Fills the string with zeroes.
|
629
|
+
#
|
630
|
+
# Return a copy of the string left filled with ASCII '0' digits to make a string
|
631
|
+
# of length width.
|
632
|
+
#
|
633
|
+
# A leading sign prefix ('+'/'-') is handled by inserting the padding after the
|
634
|
+
# sign character rather than before. The original string is returned if width is
|
635
|
+
# less than or equal to `s.length`.
|
636
|
+
#
|
637
|
+
# @param length [Integer]
|
638
|
+
# Fill the value up to this length
|
639
|
+
#
|
640
|
+
# @return [Expr]
|
641
|
+
#
|
642
|
+
# @example
|
643
|
+
# df = Polars::DataFrame.new({"a" => [-1, 123, 999999, nil]})
|
644
|
+
# df.with_columns(Polars.col("a").cast(Polars::String).str.zfill(4).alias("zfill"))
|
645
|
+
# # =>
|
646
|
+
# # shape: (4, 2)
|
647
|
+
# # ┌────────┬────────┐
|
648
|
+
# # │ a ┆ zfill │
|
649
|
+
# # │ --- ┆ --- │
|
650
|
+
# # │ i64 ┆ str │
|
651
|
+
# # ╞════════╪════════╡
|
652
|
+
# # │ -1 ┆ -001 │
|
653
|
+
# # │ 123 ┆ 0123 │
|
654
|
+
# # │ 999999 ┆ 999999 │
|
655
|
+
# # │ null ┆ null │
|
656
|
+
# # └────────┴────────┘
|
657
|
+
def zfill(length)
  # Unlike the padding helpers, the length is parsed into an expression first.
  length_expr = Utils.parse_into_expression(length)
  filled = _rbexpr.str_zfill(length_expr)
  Utils.wrap_expr(filled)
end
|
661
|
+
|
662
|
+
# Check if string contains a substring that matches a regex.
|
663
|
+
#
|
664
|
+
# @param pattern [String]
|
665
|
+
# A valid regex pattern.
|
666
|
+
# @param literal [Boolean]
|
667
|
+
# Treat pattern as a literal string.
|
668
|
+
#
|
669
|
+
# @return [Expr]
|
670
|
+
#
|
671
|
+
# @example
|
672
|
+
# df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
|
673
|
+
# df.select(
|
674
|
+
# [
|
675
|
+
# Polars.col("a"),
|
676
|
+
# Polars.col("a").str.contains("cat|bit").alias("regex"),
|
677
|
+
# Polars.col("a").str.contains("rab$", literal: true).alias("literal")
|
678
|
+
# ]
|
679
|
+
# )
|
680
|
+
# # =>
|
681
|
+
# # shape: (4, 3)
|
682
|
+
# # ┌─────────────┬───────┬─────────┐
|
683
|
+
# # │ a ┆ regex ┆ literal │
|
684
|
+
# # │ --- ┆ --- ┆ --- │
|
685
|
+
# # │ str ┆ bool ┆ bool │
|
686
|
+
# # ╞═════════════╪═══════╪═════════╡
|
687
|
+
# # │ Crab ┆ false ┆ false │
|
688
|
+
# # │ cat and dog ┆ true ┆ false │
|
689
|
+
# # │ rab$bit ┆ true ┆ true │
|
690
|
+
# # │ null ┆ null ┆ null │
|
691
|
+
# # └─────────────┴───────┴─────────┘
|
692
|
+
def contains(pattern, literal: false, strict: true)
  # The pattern is parsed into an expression; `literal` and `strict` are
  # forwarded unchanged, in that order.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  matched = _rbexpr.str_contains(pattern_expr, literal, strict)
  Utils.wrap_expr(matched)
end
|
696
|
+
|
697
|
+
# Check if string values end with a substring.
|
698
|
+
#
|
699
|
+
# @param sub [String]
|
700
|
+
# Suffix substring.
|
701
|
+
#
|
702
|
+
# @return [Expr]
|
703
|
+
#
|
704
|
+
# @example
|
705
|
+
# df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
|
706
|
+
# df.with_column(
|
707
|
+
# Polars.col("fruits").str.ends_with("go").alias("has_suffix")
|
708
|
+
# )
|
709
|
+
# # =>
|
710
|
+
# # shape: (3, 2)
|
711
|
+
# # ┌────────┬────────────┐
|
712
|
+
# # │ fruits ┆ has_suffix │
|
713
|
+
# # │ --- ┆ --- │
|
714
|
+
# # │ str ┆ bool │
|
715
|
+
# # ╞════════╪════════════╡
|
716
|
+
# # │ apple ┆ false │
|
717
|
+
# # │ mango ┆ true │
|
718
|
+
# # │ null ┆ null │
|
719
|
+
# # └────────┴────────────┘
|
720
|
+
#
|
721
|
+
# @example Using `ends_with` as a filter condition:
|
722
|
+
# df.filter(Polars.col("fruits").str.ends_with("go"))
|
723
|
+
# # =>
|
724
|
+
# # shape: (1, 1)
|
725
|
+
# # ┌────────┐
|
726
|
+
# # │ fruits │
|
727
|
+
# # │ --- │
|
728
|
+
# # │ str │
|
729
|
+
# # ╞════════╡
|
730
|
+
# # │ mango │
|
731
|
+
# # └────────┘
|
732
|
+
def ends_with(sub)
  # The suffix is parsed into an expression (with `str_as_lit: true`) first.
  suffix_expr = Utils.parse_into_expression(sub, str_as_lit: true)
  matched = _rbexpr.str_ends_with(suffix_expr)
  Utils.wrap_expr(matched)
end
|
736
|
+
|
737
|
+
# Check if string values start with a substring.
|
738
|
+
#
|
739
|
+
# @param sub [String]
|
740
|
+
# Prefix substring.
|
741
|
+
#
|
742
|
+
# @return [Expr]
|
743
|
+
#
|
744
|
+
# @example
|
745
|
+
# df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
|
746
|
+
# df.with_column(
|
747
|
+
# Polars.col("fruits").str.starts_with("app").alias("has_prefix")
|
748
|
+
# )
|
749
|
+
# # =>
|
750
|
+
# # shape: (3, 2)
|
751
|
+
# # ┌────────┬────────────┐
|
752
|
+
# # │ fruits ┆ has_prefix │
|
753
|
+
# # │ --- ┆ --- │
|
754
|
+
# # │ str ┆ bool │
|
755
|
+
# # ╞════════╪════════════╡
|
756
|
+
# # │ apple ┆ true │
|
757
|
+
# # │ mango ┆ false │
|
758
|
+
# # │ null ┆ null │
|
759
|
+
# # └────────┴────────────┘
|
760
|
+
#
|
761
|
+
# @example Using `starts_with` as a filter condition:
|
762
|
+
# df.filter(Polars.col("fruits").str.starts_with("app"))
|
763
|
+
# # =>
|
764
|
+
# # shape: (1, 1)
|
765
|
+
# # ┌────────┐
|
766
|
+
# # │ fruits │
|
767
|
+
# # │ --- │
|
768
|
+
# # │ str │
|
769
|
+
# # ╞════════╡
|
770
|
+
# # │ apple │
|
771
|
+
# # └────────┘
|
772
|
+
def starts_with(sub)
  # The prefix is parsed into an expression (with `str_as_lit: true`) first.
  prefix_expr = Utils.parse_into_expression(sub, str_as_lit: true)
  matched = _rbexpr.str_starts_with(prefix_expr)
  Utils.wrap_expr(matched)
end
|
776
|
+
|
777
|
+
# Parse string values as JSON.
|
778
|
+
#
|
779
|
+
# Raises an error if an invalid JSON string is encountered.
|
780
|
+
#
|
781
|
+
# @param dtype [Object]
|
782
|
+
# The dtype to cast the extracted value to. If nil, the dtype will be
|
783
|
+
# inferred from the JSON value.
|
784
|
+
#
|
785
|
+
# @return [Expr]
|
786
|
+
#
|
787
|
+
# @example
|
788
|
+
# df = Polars::DataFrame.new(
|
789
|
+
# {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
|
790
|
+
# )
|
791
|
+
# dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
|
792
|
+
# df.select(Polars.col("json").str.json_decode(dtype))
|
793
|
+
# # =>
|
794
|
+
# # shape: (3, 1)
|
795
|
+
# # ┌───────────┐
|
796
|
+
# # │ json │
|
797
|
+
# # │ --- │
|
798
|
+
# # │ struct[2] │
|
799
|
+
# # ╞═══════════╡
|
800
|
+
# # │ {1,true} │
|
801
|
+
# # │ null │
|
802
|
+
# # │ {2,false} │
|
803
|
+
# # └───────────┘
|
804
|
+
def json_decode(dtype = nil, infer_schema_length: 100)
  # A nil dtype is forwarded untouched; any other value is converted first.
  dtype = Utils.rb_type_to_dtype(dtype) unless dtype.nil?
  decoded = _rbexpr.str_json_decode(dtype, infer_schema_length)
  Utils.wrap_expr(decoded)
end
|
810
|
+
alias_method :json_extract, :json_decode
|
811
|
+
|
812
|
+
# Extract the first match of json string with provided JSONPath expression.
|
813
|
+
#
|
814
|
+
# Raises an error if an invalid JSON string is encountered.
|
815
|
+
# All returned values are cast to Utf8 regardless of their original type.
|
816
|
+
#
|
817
|
+
# Documentation on JSONPath standard can be found
|
818
|
+
# [here](https://goessner.net/articles/JsonPath/).
|
819
|
+
#
|
820
|
+
# @param json_path [String]
|
821
|
+
# A valid JSON path query string.
|
822
|
+
#
|
823
|
+
# @return [Expr]
|
824
|
+
#
|
825
|
+
# @example
|
826
|
+
# df = Polars::DataFrame.new(
|
827
|
+
# {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
|
828
|
+
# )
|
829
|
+
# df.select(Polars.col("json_val").str.json_path_match("$.a"))
|
830
|
+
# # =>
|
831
|
+
# # shape: (5, 1)
|
832
|
+
# # ┌──────────┐
|
833
|
+
# # │ json_val │
|
834
|
+
# # │ --- │
|
835
|
+
# # │ str │
|
836
|
+
# # ╞══════════╡
|
837
|
+
# # │ 1 │
|
838
|
+
# # │ null │
|
839
|
+
# # │ 2 │
|
840
|
+
# # │ 2.1 │
|
841
|
+
# # │ true │
|
842
|
+
# # └──────────┘
|
843
|
+
def json_path_match(json_path)
  # Turn the path into an expression (with str_as_lit: true) before the native call.
  path_expr = Utils.parse_into_expression(json_path, str_as_lit: true)
  matched = _rbexpr.str_json_path_match(path_expr)
  Utils.wrap_expr(matched)
end
|
847
|
+
|
848
|
+
# Decode a value using the provided encoding.
|
849
|
+
#
|
850
|
+
# @param encoding ["hex", "base64"]
|
851
|
+
# The encoding to use.
|
852
|
+
# @param strict [Boolean]
|
853
|
+
# How to handle invalid inputs:
|
854
|
+
#
|
855
|
+
# - `true`: An error will be thrown if unable to decode a value.
|
856
|
+
# - `false`: Unhandled values will be replaced with `nil`.
|
857
|
+
#
|
858
|
+
# @return [Expr]
|
859
|
+
#
|
860
|
+
# @example
|
861
|
+
# df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
|
862
|
+
# df.select(Polars.col("encoded").str.decode("hex"))
|
863
|
+
# # =>
|
864
|
+
# # shape: (3, 1)
|
865
|
+
# # ┌─────────┐
|
866
|
+
# # │ encoded │
|
867
|
+
# # │ --- │
|
868
|
+
# # │ binary │
|
869
|
+
# # ╞═════════╡
|
870
|
+
# # │ b"foo" │
|
871
|
+
# # │ b"bar" │
|
872
|
+
# # │ null │
|
873
|
+
# # └─────────┘
|
874
|
+
def decode(encoding, strict: true)
  # Dispatch on the encoding name; only the exact strings "hex" and "base64"
  # are accepted. `strict` is forwarded unchanged to the native decoder.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # Use `inspect` so nil, symbols and strings are distinguishable in the
    # message (e.g. :hex vs "hex"); the set is written with single braces.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
|
883
|
+
|
884
|
+
# Encode a value using the provided encoding.
|
885
|
+
#
|
886
|
+
# @param encoding ["hex", "base64"]
|
887
|
+
# The encoding to use.
|
888
|
+
#
|
889
|
+
# @return [Expr]
|
890
|
+
#
|
891
|
+
# @example
|
892
|
+
# df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
|
893
|
+
# df.select(Polars.col("strings").str.encode("hex"))
|
894
|
+
# # =>
|
895
|
+
# # shape: (3, 1)
|
896
|
+
# # ┌─────────┐
|
897
|
+
# # │ strings │
|
898
|
+
# # │ --- │
|
899
|
+
# # │ str │
|
900
|
+
# # ╞═════════╡
|
901
|
+
# # │ 666f6f │
|
902
|
+
# # │ 626172 │
|
903
|
+
# # │ null │
|
904
|
+
# # └─────────┘
|
905
|
+
def encode(encoding)
  # Dispatch on the encoding name; only the exact strings "hex" and "base64"
  # are accepted.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # Use `inspect` so nil, symbols and strings are distinguishable in the
    # message (e.g. :hex vs "hex"); the set is written with single braces.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
|
914
|
+
|
915
|
+
# Extract the target capture group from provided patterns.
|
916
|
+
#
|
917
|
+
# @param pattern [String]
|
918
|
+
# A valid regex pattern
|
919
|
+
# @param group_index [Integer]
|
920
|
+
# Index of the targeted capture group.
|
921
|
+
# Group 0 means the whole pattern; the first capture group begins at index 1.
|
922
|
+
# Default to the first capture group
|
923
|
+
#
|
924
|
+
# @return [Expr]
|
925
|
+
#
|
926
|
+
# @example
|
927
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
928
|
+
# df.select(
|
929
|
+
# [
|
930
|
+
# Polars.col("foo").str.extract('(\d+)')
|
931
|
+
# ]
|
932
|
+
# )
|
933
|
+
# # =>
|
934
|
+
# # shape: (2, 1)
|
935
|
+
# # ┌─────┐
|
936
|
+
# # │ foo │
|
937
|
+
# # │ --- │
|
938
|
+
# # │ str │
|
939
|
+
# # ╞═════╡
|
940
|
+
# # │ 123 │
|
941
|
+
# # │ 678 │
|
942
|
+
# # └─────┘
|
943
|
+
def extract(pattern, group_index: 1)
  # Turn the pattern into an expression (with str_as_lit: true) before the native call.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  captured = _rbexpr.str_extract(pattern_expr, group_index)
  Utils.wrap_expr(captured)
end
|
947
|
+
|
948
|
+
# Extracts all matches for the given regex pattern.
|
949
|
+
#
|
950
|
+
# Extracts each successive non-overlapping regex match in an individual string as
|
951
|
+
# an array.
|
952
|
+
#
|
953
|
+
# @param pattern [String]
|
954
|
+
# A valid regex pattern
|
955
|
+
#
|
956
|
+
# @return [Expr]
|
957
|
+
#
|
958
|
+
# @example
|
959
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
960
|
+
# df.select(
|
961
|
+
# [
|
962
|
+
# Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
|
963
|
+
# ]
|
964
|
+
# )
|
965
|
+
# # =>
|
966
|
+
# # shape: (2, 1)
|
967
|
+
# # ┌────────────────┐
|
968
|
+
# # │ extracted_nrs │
|
969
|
+
# # │ --- │
|
970
|
+
# # │ list[str] │
|
971
|
+
# # ╞════════════════╡
|
972
|
+
# # │ ["123", "45"] │
|
973
|
+
# # │ ["678", "910"] │
|
974
|
+
# # └────────────────┘
|
975
|
+
def extract_all(pattern)
  # Turn the pattern into an expression (with str_as_lit: true) before the native call.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  all_matches = _rbexpr.str_extract_all(pattern_expr)
  Utils.wrap_expr(all_matches)
end
|
979
|
+
|
980
|
+
# Extract all capture groups for the given regex pattern.
|
981
|
+
#
|
982
|
+
# @param pattern [String]
|
983
|
+
# A valid regular expression pattern containing at least one capture group,
|
984
|
+
# compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
|
985
|
+
#
|
986
|
+
# @return [Expr]
|
987
|
+
#
|
988
|
+
# @example
|
989
|
+
# df = Polars::DataFrame.new(
|
990
|
+
# {
|
991
|
+
# "url": [
|
992
|
+
# "http://vote.com/ballon_dor?candidate=messi&ref=python",
|
993
|
+
# "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
|
994
|
+
# "http://vote.com/ballon_dor?error=404&ref=rust"
|
995
|
+
# ]
|
996
|
+
# }
|
997
|
+
# )
|
998
|
+
# pattern = /candidate=(?<candidate>\w+)&ref=(?<ref>\w+)/.to_s
|
999
|
+
# df.select(captures: Polars.col("url").str.extract_groups(pattern)).unnest(
|
1000
|
+
# "captures"
|
1001
|
+
# )
|
1002
|
+
# # =>
|
1003
|
+
# # shape: (3, 2)
|
1004
|
+
# # ┌───────────┬────────┐
|
1005
|
+
# # │ candidate ┆ ref │
|
1006
|
+
# # │ --- ┆ --- │
|
1007
|
+
# # │ str ┆ str │
|
1008
|
+
# # ╞═══════════╪════════╡
|
1009
|
+
# # │ messi ┆ python │
|
1010
|
+
# # │ weghorst ┆ polars │
|
1011
|
+
# # │ null ┆ null │
|
1012
|
+
# # └───────────┴────────┘
|
1013
|
+
#
|
1014
|
+
# @example Unnamed groups have their numerical position converted to a string:
|
1015
|
+
# pattern = /candidate=(\w+)&ref=(\w+)/.to_s
|
1016
|
+
# (
|
1017
|
+
# df.with_columns(
|
1018
|
+
# captures: Polars.col("url").str.extract_groups(pattern)
|
1019
|
+
# ).with_columns(name: Polars.col("captures").struct["1"].str.to_uppercase)
|
1020
|
+
# )
|
1021
|
+
# # =>
|
1022
|
+
# # shape: (3, 3)
|
1023
|
+
# # ┌─────────────────────────────────┬───────────────────────┬──────────┐
|
1024
|
+
# # │ url ┆ captures ┆ name │
|
1025
|
+
# # │ --- ┆ --- ┆ --- │
|
1026
|
+
# # │ str ┆ struct[2] ┆ str │
|
1027
|
+
# # ╞═════════════════════════════════╪═══════════════════════╪══════════╡
|
1028
|
+
# # │ http://vote.com/ballon_dor?can… ┆ {"messi","python"} ┆ MESSI │
|
1029
|
+
# # │ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │
|
1030
|
+
# # │ http://vote.com/ballon_dor?err… ┆ {null,null} ┆ null │
|
1031
|
+
# # └─────────────────────────────────┴───────────────────────┴──────────┘
|
1032
|
+
def extract_groups(pattern)
  # The pattern is handed to the native call as given (no expression parsing here).
  groups = _rbexpr.str_extract_groups(pattern)
  Utils.wrap_expr(groups)
end
|
1035
|
+
|
1036
|
+
# Count all successive non-overlapping regex matches.
|
1037
|
+
#
|
1038
|
+
# @param pattern [String]
|
1039
|
+
# A valid regex pattern
|
1040
|
+
#
|
1041
|
+
# @return [Expr]
|
1042
|
+
#
|
1043
|
+
# @example
|
1044
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
1045
|
+
# df.select(
|
1046
|
+
# [
|
1047
|
+
# Polars.col("foo").str.count_match('\d').alias("count_digits")
|
1048
|
+
# ]
|
1049
|
+
# )
|
1050
|
+
# # =>
|
1051
|
+
# # shape: (2, 1)
|
1052
|
+
# # ┌──────────────┐
|
1053
|
+
# # │ count_digits │
|
1054
|
+
# # │ --- │
|
1055
|
+
# # │ u32 │
|
1056
|
+
# # ╞══════════════╡
|
1057
|
+
# # │ 5 │
|
1058
|
+
# # │ 6 │
|
1059
|
+
# # └──────────────┘
|
1060
|
+
def count_matches(pattern, literal: false)
  # Turn the pattern into an expression; `literal` is forwarded unchanged.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  counted = _rbexpr.str_count_matches(pattern_expr, literal)
  Utils.wrap_expr(counted)
end
|
1064
|
+
alias_method :count_match, :count_matches
|
1065
|
+
|
1066
|
+
# Split the string by a substring.
|
1067
|
+
#
|
1068
|
+
# @param by [String]
|
1069
|
+
# Substring to split by.
|
1070
|
+
# @param inclusive [Boolean]
|
1071
|
+
# If true, include the split character/string in the results.
|
1072
|
+
#
|
1073
|
+
# @return [Expr]
|
1074
|
+
#
|
1075
|
+
# @example
|
1076
|
+
# df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
|
1077
|
+
# df.select(Polars.col("s").str.split(" "))
|
1078
|
+
# # =>
|
1079
|
+
# # shape: (3, 1)
|
1080
|
+
# # ┌───────────────────────┐
|
1081
|
+
# # │ s │
|
1082
|
+
# # │ --- │
|
1083
|
+
# # │ list[str] │
|
1084
|
+
# # ╞═══════════════════════╡
|
1085
|
+
# # │ ["foo", "bar"] │
|
1086
|
+
# # │ ["foo-bar"] │
|
1087
|
+
# # │ ["foo", "bar", "baz"] │
|
1088
|
+
# # └───────────────────────┘
|
1089
|
+
def split(by, inclusive: false)
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  # Choose the native call according to `inclusive`, then wrap once.
  pieces =
    if inclusive
      _rbexpr.str_split_inclusive(separator)
    else
      _rbexpr.str_split(separator)
    end
  Utils.wrap_expr(pieces)
end
|
1096
|
+
|
1097
|
+
# Split the string by a substring using `n` splits.
|
1098
|
+
#
|
1099
|
+
# Results in a struct of `n+1` fields.
|
1100
|
+
#
|
1101
|
+
# If it cannot make `n` splits, the remaining field elements will be null.
|
1102
|
+
#
|
1103
|
+
# @param by [String]
|
1104
|
+
# Substring to split by.
|
1105
|
+
# @param n [Integer]
|
1106
|
+
# Number of splits to make.
|
1107
|
+
# @param inclusive [Boolean]
|
1108
|
+
# If true, include the split character/string in the results.
|
1109
|
+
#
|
1110
|
+
# @return [Expr]
|
1111
|
+
#
|
1112
|
+
# @example
|
1113
|
+
# df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
|
1114
|
+
# df.select(
|
1115
|
+
# [
|
1116
|
+
# Polars.col("x").str.split_exact("_", 1).alias("fields")
|
1117
|
+
# ]
|
1118
|
+
# )
|
1119
|
+
# # =>
|
1120
|
+
# # shape: (4, 1)
|
1121
|
+
# # ┌─────────────┐
|
1122
|
+
# # │ fields │
|
1123
|
+
# # │ --- │
|
1124
|
+
# # │ struct[2] │
|
1125
|
+
# # ╞═════════════╡
|
1126
|
+
# # │ {"a","1"} │
|
1127
|
+
# # │ {null,null} │
|
1128
|
+
# # │ {"c",null} │
|
1129
|
+
# # │ {"d","4"} │
|
1130
|
+
# # └─────────────┘
|
1131
|
+
def split_exact(by, n, inclusive: false)
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  # Inclusive mode uses its own native call; otherwise fall through to the plain one.
  return Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(separator, n)) if inclusive

  Utils.wrap_expr(_rbexpr.str_split_exact(separator, n))
end
|
1139
|
+
|
1140
|
+
# Split the string by a substring, restricted to returning at most `n` items.
|
1141
|
+
#
|
1142
|
+
# If the number of possible splits is less than `n-1`, the remaining field
|
1143
|
+
# elements will be null. If the number of possible splits is `n-1` or greater,
|
1144
|
+
# the last (nth) substring will contain the remainder of the string.
|
1145
|
+
#
|
1146
|
+
# @param by [String]
|
1147
|
+
# Substring to split by.
|
1148
|
+
# @param n [Integer]
|
1149
|
+
# Max number of items to return.
|
1150
|
+
#
|
1151
|
+
# @return [Expr]
|
1152
|
+
#
|
1153
|
+
# @example
|
1154
|
+
# df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
|
1155
|
+
# df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
|
1156
|
+
# # =>
|
1157
|
+
# # shape: (4, 1)
|
1158
|
+
# # ┌───────────────────┐
|
1159
|
+
# # │ fields │
|
1160
|
+
# # │ --- │
|
1161
|
+
# # │ struct[2] │
|
1162
|
+
# # ╞═══════════════════╡
|
1163
|
+
# # │ {"foo","bar"} │
|
1164
|
+
# # │ {null,null} │
|
1165
|
+
# # │ {"foo-bar",null} │
|
1166
|
+
# # │ {"foo","bar baz"} │
|
1167
|
+
# # └───────────────────┘
|
1168
|
+
def splitn(by, n)
  # Turn the separator into an expression; `n` is forwarded unchanged.
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  fields = _rbexpr.str_splitn(separator, n)
  Utils.wrap_expr(fields)
end
|
1172
|
+
|
1173
|
+
# Replace first matching regex/literal substring with a new string value.
|
1174
|
+
#
|
1175
|
+
# @param pattern [String]
|
1176
|
+
# Regex pattern.
|
1177
|
+
# @param value [String]
|
1178
|
+
# Replacement string.
|
1179
|
+
# @param literal [Boolean]
|
1180
|
+
# Treat pattern as a literal string.
|
1181
|
+
#
|
1182
|
+
# @return [Expr]
|
1183
|
+
#
|
1184
|
+
# @example
|
1185
|
+
# df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
|
1186
|
+
# df.with_column(
|
1187
|
+
# Polars.col("text").str.replace('abc\b', "ABC")
|
1188
|
+
# )
|
1189
|
+
# # =>
|
1190
|
+
# # shape: (2, 2)
|
1191
|
+
# # ┌─────┬────────┐
|
1192
|
+
# # │ id ┆ text │
|
1193
|
+
# # │ --- ┆ --- │
|
1194
|
+
# # │ i64 ┆ str │
|
1195
|
+
# # ╞═════╪════════╡
|
1196
|
+
# # │ 1 ┆ 123ABC │
|
1197
|
+
# # │ 2 ┆ abc456 │
|
1198
|
+
# # └─────┴────────┘
|
1199
|
+
def replace(pattern, value, literal: false, n: 1)
  # Both pattern and replacement are turned into expressions (pattern first);
  # `literal` and `n` are forwarded unchanged to the native call.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  value_expr = Utils.parse_into_expression(value, str_as_lit: true)
  replaced = _rbexpr.str_replace_n(pattern_expr, value_expr, literal, n)
  Utils.wrap_expr(replaced)
end
|
1204
|
+
|
1205
|
+
# Replace all matching regex/literal substrings with a new string value.
|
1206
|
+
#
|
1207
|
+
# @param pattern [String]
|
1208
|
+
# Regex pattern.
|
1209
|
+
# @param value [String]
|
1210
|
+
# Replacement string.
|
1211
|
+
# @param literal [Boolean]
|
1212
|
+
# Treat pattern as a literal string.
|
1213
|
+
#
|
1214
|
+
# @return [Expr]
|
1215
|
+
#
|
1216
|
+
# @example
|
1217
|
+
# df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
|
1218
|
+
# df.with_column(Polars.col("text").str.replace_all("a", "-"))
|
1219
|
+
# # =>
|
1220
|
+
# # shape: (2, 2)
|
1221
|
+
# # ┌─────┬─────────┐
|
1222
|
+
# # │ id ┆ text │
|
1223
|
+
# # │ --- ┆ --- │
|
1224
|
+
# # │ i64 ┆ str │
|
1225
|
+
# # ╞═════╪═════════╡
|
1226
|
+
# # │ 1 ┆ -bc-bc │
|
1227
|
+
# # │ 2 ┆ 123-123 │
|
1228
|
+
# # └─────┴─────────┘
|
1229
|
+
def replace_all(pattern, value, literal: false)
  # Both pattern and replacement are turned into expressions (pattern first);
  # `literal` is forwarded unchanged to the native call.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  value_expr = Utils.parse_into_expression(value, str_as_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr, value_expr, literal)
  Utils.wrap_expr(replaced)
end
|
1234
|
+
|
1235
|
+
# Returns string values in reversed order.
|
1236
|
+
#
|
1237
|
+
# @return [Expr]
|
1238
|
+
#
|
1239
|
+
# @example
|
1240
|
+
# df = Polars::DataFrame.new({"text" => ["foo", "bar", "man\u0303ana"]})
|
1241
|
+
# df.with_columns(Polars.col("text").str.reverse.alias("reversed"))
|
1242
|
+
# # =>
|
1243
|
+
# # shape: (3, 2)
|
1244
|
+
# # ┌────────┬──────────┐
|
1245
|
+
# # │ text ┆ reversed │
|
1246
|
+
# # │ --- ┆ --- │
|
1247
|
+
# # │ str ┆ str │
|
1248
|
+
# # ╞════════╪══════════╡
|
1249
|
+
# # │ foo ┆ oof │
|
1250
|
+
# # │ bar ┆ rab │
|
1251
|
+
# # │ mañana ┆ anañam │
|
1252
|
+
# # └────────┴──────────┘
|
1253
|
+
def reverse
  # Delegate to the native reversal and wrap the result.
  reversed = _rbexpr.str_reverse
  Utils.wrap_expr(reversed)
end
|
1256
|
+
|
1257
|
+
# Create subslices of the string values of a Utf8 Series.
|
1258
|
+
#
|
1259
|
+
# @param offset [Integer]
|
1260
|
+
# Start index. Negative indexing is supported.
|
1261
|
+
# @param length [Integer]
|
1262
|
+
# Length of the slice. If set to `nil` (default), the slice is taken to the
|
1263
|
+
# end of the string.
|
1264
|
+
#
|
1265
|
+
# @return [Expr]
|
1266
|
+
#
|
1267
|
+
# @example
|
1268
|
+
# df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
|
1269
|
+
# df.with_column(
|
1270
|
+
# Polars.col("s").str.slice(-3).alias("s_sliced")
|
1271
|
+
# )
|
1272
|
+
# # =>
|
1273
|
+
# # shape: (4, 2)
|
1274
|
+
# # ┌─────────────┬──────────┐
|
1275
|
+
# # │ s ┆ s_sliced │
|
1276
|
+
# # │ --- ┆ --- │
|
1277
|
+
# # │ str ┆ str │
|
1278
|
+
# # ╞═════════════╪══════════╡
|
1279
|
+
# # │ pear ┆ ear │
|
1280
|
+
# # │ null ┆ null │
|
1281
|
+
# # │ papaya ┆ aya │
|
1282
|
+
# # │ dragonfruit ┆ uit │
|
1283
|
+
# # └─────────────┴──────────┘
|
1284
|
+
def slice(offset, length = nil)
  # Both arguments go through expression parsing (offset first), including a nil length.
  offset_expr = Utils.parse_into_expression(offset)
  length_expr = Utils.parse_into_expression(length)
  sliced = _rbexpr.str_slice(offset_expr, length_expr)
  Utils.wrap_expr(sliced)
end
|
1289
|
+
|
1290
|
+
# Convert an Utf8 column into an Int64 column with base radix.
|
1291
|
+
#
|
1292
|
+
# @param base [Integer]
|
1293
|
+
# Positive integer which is the base of the string we are parsing.
|
1294
|
+
# Default: 10.
|
1295
|
+
# @param strict [Boolean]
|
1296
|
+
# Bool, default=true will raise any ParseError or overflow as ComputeError.
|
1297
|
+
# If false, values that fail to parse are silently converted to null.
|
1298
|
+
#
|
1299
|
+
# @return [Expr]
|
1300
|
+
#
|
1301
|
+
# @example
|
1302
|
+
# df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
|
1303
|
+
# df.with_columns(Polars.col("bin").str.to_integer(base: 2, strict: false).alias("parsed"))
|
1304
|
+
# # =>
|
1305
|
+
# # shape: (4, 2)
|
1306
|
+
# # ┌─────────┬────────┐
|
1307
|
+
# # │ bin ┆ parsed │
|
1308
|
+
# # │ --- ┆ --- │
|
1309
|
+
# # │ str ┆ i64 │
|
1310
|
+
# # ╞═════════╪════════╡
|
1311
|
+
# # │ 110 ┆ 6 │
|
1312
|
+
# # │ 101 ┆ 5 │
|
1313
|
+
# # │ 010 ┆ 2 │
|
1314
|
+
# # │ invalid ┆ null │
|
1315
|
+
# # └─────────┴────────┘
|
1316
|
+
#
|
1317
|
+
# @example
|
1318
|
+
# df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
|
1319
|
+
# df.with_columns(Polars.col("hex").str.to_integer(base: 16, strict: true).alias("parsed"))
|
1320
|
+
# # =>
|
1321
|
+
# # shape: (4, 2)
|
1322
|
+
# # ┌──────┬────────┐
|
1323
|
+
# # │ hex ┆ parsed │
|
1324
|
+
# # │ --- ┆ --- │
|
1325
|
+
# # │ str ┆ i64 │
|
1326
|
+
# # ╞══════╪════════╡
|
1327
|
+
# # │ fa1e ┆ 64030 │
|
1328
|
+
# # │ ff00 ┆ 65280 │
|
1329
|
+
# # │ cafe ┆ 51966 │
|
1330
|
+
# # │ null ┆ null │
|
1331
|
+
# # └──────┴────────┘
|
1332
|
+
def to_integer(base: 10, strict: true)
  # The base is turned into an expression (str_as_lit: false); `strict` is forwarded as-is.
  base_expr = Utils.parse_into_expression(base, str_as_lit: false)
  parsed = _rbexpr.str_to_integer(base_expr, strict)
  Utils.wrap_expr(parsed)
end
|
1336
|
+
|
1337
|
+
# Parse integers with base radix from strings.
|
1338
|
+
#
|
1339
|
+
# Uses base 2 by default. With `strict: false`, parse errors and overflows become null.
|
1340
|
+
#
|
1341
|
+
# @param radix [Integer]
|
1342
|
+
# Positive integer which is the base of the string we are parsing.
|
1343
|
+
# Default: 2.
|
1344
|
+
# @param strict [Boolean]
|
1345
|
+
# Bool, Default=true will raise any ParseError or overflow as ComputeError.
|
1346
|
+
# If false, values that fail to parse are silently converted to null.
|
1347
|
+
#
|
1348
|
+
# @return [Expr]
|
1349
|
+
#
|
1350
|
+
# @example
|
1351
|
+
# df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
|
1352
|
+
# df.select(Polars.col("bin").str.parse_int(2, strict: false))
|
1353
|
+
# # =>
|
1354
|
+
# # shape: (4, 1)
|
1355
|
+
# # ┌──────┐
|
1356
|
+
# # │ bin │
|
1357
|
+
# # │ --- │
|
1358
|
+
# # │ i32 │
|
1359
|
+
# # ╞══════╡
|
1360
|
+
# # │ 6 │
|
1361
|
+
# # │ 5 │
|
1362
|
+
# # │ 2 │
|
1363
|
+
# # │ null │
|
1364
|
+
# # └──────┘
|
1365
|
+
def parse_int(radix = 2, strict: true)
  # Parse via `to_integer` using the caller-supplied radix (defaults to 2),
  # then cast the result to Int32. `strict` is applied to both the parse and
  # the cast. The radix used to be hard-coded to 2 here, which silently
  # ignored the `radix` argument.
  to_integer(base: radix, strict: strict).cast(Int32, strict: strict)
end
|
1368
|
+
|
1369
|
+
# Use the aho-corasick algorithm to find matches.
|
1370
|
+
#
|
1371
|
+
# This version determines if any of the patterns find a match.
|
1372
|
+
#
|
1373
|
+
# @param patterns [Array<String>]
|
1374
|
+
# String patterns to search.
|
1375
|
+
# @param ascii_case_insensitive [Boolean]
|
1376
|
+
# Enable ASCII-aware case insensitive matching.
|
1377
|
+
# When this option is enabled, searching will be performed without respect
|
1378
|
+
# to case for ASCII letters (a-z and A-Z) only.
|
1379
|
+
#
|
1380
|
+
# @return [Expr]
|
1381
|
+
#
|
1382
|
+
# @example
|
1383
|
+
# df = Polars::DataFrame.new(
|
1384
|
+
# {
|
1385
|
+
# "lyrics": [
|
1386
|
+
# "Everybody wants to rule the world",
|
1387
|
+
# "Tell me what you want, what you really really want",
|
1388
|
+
# "Can you feel the love tonight"
|
1389
|
+
# ]
|
1390
|
+
# }
|
1391
|
+
# )
|
1392
|
+
# df.with_columns(
|
1393
|
+
# Polars.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
|
1394
|
+
# )
|
1395
|
+
# # =>
|
1396
|
+
# # shape: (3, 2)
|
1397
|
+
# # ┌─────────────────────────────────┬──────────────┐
|
1398
|
+
# # │ lyrics ┆ contains_any │
|
1399
|
+
# # │ --- ┆ --- │
|
1400
|
+
# # │ str ┆ bool │
|
1401
|
+
# # ╞═════════════════════════════════╪══════════════╡
|
1402
|
+
# # │ Everybody wants to rule the wo… ┆ false │
|
1403
|
+
# # │ Tell me what you want, what yo… ┆ true │
|
1404
|
+
# # │ Can you feel the love tonight ┆ true │
|
1405
|
+
# # └─────────────────────────────────┴──────────────┘
|
1406
|
+
def contains_any(patterns, ascii_case_insensitive: false)
  # Patterns are parsed with str_as_lit: false and list_as_series: true.
  patterns_expr = Utils.parse_into_expression(patterns, str_as_lit: false, list_as_series: true)
  found = _rbexpr.str_contains_any(patterns_expr, ascii_case_insensitive)
  Utils.wrap_expr(found)
end
|
1412
|
+
|
1413
|
+
# Use the aho-corasick algorithm to replace many matches.
|
1414
|
+
#
|
1415
|
+
# @param patterns [Array<String>]
|
1416
|
+
# String patterns to search and replace.
|
1417
|
+
# @param replace_with [String]
|
1418
|
+
# Strings to replace where a pattern was a match.
|
1419
|
+
# This can be broadcasted. So it supports many:one and many:many.
|
1420
|
+
# @param ascii_case_insensitive [Boolean]
|
1421
|
+
# Enable ASCII-aware case insensitive matching.
|
1422
|
+
# When this option is enabled, searching will be performed without respect
|
1423
|
+
# to case for ASCII letters (a-z and A-Z) only.
|
1424
|
+
#
|
1425
|
+
# @return [Expr]
|
1426
|
+
#
|
1427
|
+
# @example
|
1428
|
+
# df = Polars::DataFrame.new(
|
1429
|
+
# {
|
1430
|
+
# "lyrics": [
|
1431
|
+
# "Everybody wants to rule the world",
|
1432
|
+
# "Tell me what you want, what you really really want",
|
1433
|
+
# "Can you feel the love tonight"
|
1434
|
+
# ]
|
1435
|
+
# }
|
1436
|
+
# )
|
1437
|
+
# df.with_columns(
|
1438
|
+
# Polars.col("lyrics")
|
1439
|
+
# .str.replace_many(
|
1440
|
+
# ["me", "you", "they"],
|
1441
|
+
# ""
|
1442
|
+
# )
|
1443
|
+
# .alias("removes_pronouns")
|
1444
|
+
# )
|
1445
|
+
# # =>
|
1446
|
+
# # shape: (3, 2)
|
1447
|
+
# # ┌─────────────────────────────────┬─────────────────────────────────┐
|
1448
|
+
# # │ lyrics ┆ removes_pronouns │
|
1449
|
+
# # │ --- ┆ --- │
|
1450
|
+
# # │ str ┆ str │
|
1451
|
+
# # ╞═════════════════════════════════╪═════════════════════════════════╡
|
1452
|
+
# # │ Everybody wants to rule the wo… ┆ Everybody wants to rule the wo… │
|
1453
|
+
# # │ Tell me what you want, what yo… ┆ Tell what want, what really… │
|
1454
|
+
# # │ Can you feel the love tonight ┆ Can feel the love tonight │
|
1455
|
+
# # └─────────────────────────────────┴─────────────────────────────────┘
|
1456
|
+
#
|
1457
|
+
# @example
|
1458
|
+
# df.with_columns(
|
1459
|
+
# Polars.col("lyrics")
|
1460
|
+
# .str.replace_many(
|
1461
|
+
# ["me", "you"],
|
1462
|
+
# ["you", "me"]
|
1463
|
+
# )
|
1464
|
+
# .alias("confusing")
|
1465
|
+
# )
|
1466
|
+
# # =>
|
1467
|
+
# # shape: (3, 2)
|
1468
|
+
# # ┌─────────────────────────────────┬─────────────────────────────────┐
|
1469
|
+
# # │ lyrics ┆ confusing │
|
1470
|
+
# # │ --- ┆ --- │
|
1471
|
+
# # │ str ┆ str │
|
1472
|
+
# # ╞═════════════════════════════════╪═════════════════════════════════╡
|
1473
|
+
# # │ Everybody wants to rule the wo… ┆ Everybody wants to rule the wo… │
|
1474
|
+
# # │ Tell me what you want, what yo… ┆ Tell you what me want, what me… │
|
1475
|
+
# # │ Can you feel the love tonight ┆ Can me feel the love tonight │
|
1476
|
+
# # └─────────────────────────────────┴─────────────────────────────────┘
|
1477
|
+
def replace_many(patterns, replace_with, ascii_case_insensitive: false)
  # Patterns are parsed with str_as_lit: false, replacements with str_as_lit: true;
  # both use list_as_series: true. Patterns are parsed first.
  patterns_expr = Utils.parse_into_expression(patterns, str_as_lit: false, list_as_series: true)
  replacements_expr = Utils.parse_into_expression(replace_with, str_as_lit: true, list_as_series: true)
  replaced = _rbexpr.str_replace_many(patterns_expr, replacements_expr, ascii_case_insensitive)
  Utils.wrap_expr(replaced)
end
|
1488
|
+
|
1489
|
+
private
|
1490
|
+
|
1491
|
+
def _validate_format_argument(format)
|
1492
|
+
# TODO: not implemented yet; currently a no-op that ignores `format` and returns nil.
|
1493
|
+
end
|
1494
|
+
end
|
1495
|
+
end
|