polars-df 0.13.0-aarch64-linux-musl
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/CHANGELOG.md +208 -0
- data/Cargo.lock +2556 -0
- data/Cargo.toml +6 -0
- data/LICENSE-THIRD-PARTY.txt +39059 -0
- data/LICENSE.txt +20 -0
- data/README.md +437 -0
- data/lib/polars/3.1/polars.so +0 -0
- data/lib/polars/3.2/polars.so +0 -0
- data/lib/polars/3.3/polars.so +0 -0
- data/lib/polars/array_expr.rb +537 -0
- data/lib/polars/array_name_space.rb +423 -0
- data/lib/polars/batched_csv_reader.rb +104 -0
- data/lib/polars/binary_expr.rb +77 -0
- data/lib/polars/binary_name_space.rb +66 -0
- data/lib/polars/cat_expr.rb +36 -0
- data/lib/polars/cat_name_space.rb +88 -0
- data/lib/polars/config.rb +530 -0
- data/lib/polars/convert.rb +98 -0
- data/lib/polars/data_frame.rb +5191 -0
- data/lib/polars/data_types.rb +466 -0
- data/lib/polars/date_time_expr.rb +1397 -0
- data/lib/polars/date_time_name_space.rb +1287 -0
- data/lib/polars/dynamic_group_by.rb +52 -0
- data/lib/polars/exceptions.rb +38 -0
- data/lib/polars/expr.rb +7256 -0
- data/lib/polars/expr_dispatch.rb +22 -0
- data/lib/polars/functions/aggregation/horizontal.rb +246 -0
- data/lib/polars/functions/aggregation/vertical.rb +282 -0
- data/lib/polars/functions/as_datatype.rb +271 -0
- data/lib/polars/functions/col.rb +47 -0
- data/lib/polars/functions/eager.rb +182 -0
- data/lib/polars/functions/lazy.rb +1329 -0
- data/lib/polars/functions/len.rb +49 -0
- data/lib/polars/functions/lit.rb +35 -0
- data/lib/polars/functions/random.rb +16 -0
- data/lib/polars/functions/range/date_range.rb +136 -0
- data/lib/polars/functions/range/datetime_range.rb +149 -0
- data/lib/polars/functions/range/int_range.rb +51 -0
- data/lib/polars/functions/range/time_range.rb +141 -0
- data/lib/polars/functions/repeat.rb +144 -0
- data/lib/polars/functions/whenthen.rb +96 -0
- data/lib/polars/functions.rb +57 -0
- data/lib/polars/group_by.rb +613 -0
- data/lib/polars/io/avro.rb +24 -0
- data/lib/polars/io/csv.rb +696 -0
- data/lib/polars/io/database.rb +73 -0
- data/lib/polars/io/ipc.rb +275 -0
- data/lib/polars/io/json.rb +29 -0
- data/lib/polars/io/ndjson.rb +80 -0
- data/lib/polars/io/parquet.rb +233 -0
- data/lib/polars/lazy_frame.rb +2708 -0
- data/lib/polars/lazy_group_by.rb +181 -0
- data/lib/polars/list_expr.rb +791 -0
- data/lib/polars/list_name_space.rb +449 -0
- data/lib/polars/meta_expr.rb +222 -0
- data/lib/polars/name_expr.rb +198 -0
- data/lib/polars/plot.rb +109 -0
- data/lib/polars/rolling_group_by.rb +35 -0
- data/lib/polars/series.rb +4444 -0
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/sql_context.rb +194 -0
- data/lib/polars/string_cache.rb +75 -0
- data/lib/polars/string_expr.rb +1495 -0
- data/lib/polars/string_name_space.rb +811 -0
- data/lib/polars/struct_expr.rb +98 -0
- data/lib/polars/struct_name_space.rb +96 -0
- data/lib/polars/testing.rb +507 -0
- data/lib/polars/utils/constants.rb +9 -0
- data/lib/polars/utils/convert.rb +97 -0
- data/lib/polars/utils/parse.rb +89 -0
- data/lib/polars/utils/various.rb +76 -0
- data/lib/polars/utils/wrap.rb +19 -0
- data/lib/polars/utils.rb +130 -0
- data/lib/polars/version.rb +4 -0
- data/lib/polars/whenthen.rb +83 -0
- data/lib/polars-df.rb +1 -0
- data/lib/polars.rb +91 -0
- metadata +138 -0
@@ -0,0 +1,1495 @@
|
|
1
|
+
module Polars
|
2
|
+
# Namespace for string related expressions.
|
3
|
+
class StringExpr
|
4
|
+
# @private
|
5
|
+
attr_accessor :_rbexpr
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def initialize(expr)
|
9
|
+
self._rbexpr = expr._rbexpr
|
10
|
+
end
|
11
|
+
|
12
|
+
# Convert a Utf8 column into a Date column.
|
13
|
+
#
|
14
|
+
# @param format [String]
|
15
|
+
# Format to use for conversion. Refer to the
|
16
|
+
# [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
17
|
+
# for the full specification. Example: `"%Y-%m-%d"`.
|
18
|
+
# If set to nil (default), the format is inferred from the data.
|
19
|
+
# @param strict [Boolean]
|
20
|
+
# Raise an error if any conversion fails.
|
21
|
+
# @param exact [Boolean]
|
22
|
+
# Require an exact format match. If false, allow the format to match anywhere
|
23
|
+
# in the target string.
|
24
|
+
# @param cache [Boolean]
|
25
|
+
# Use a cache of unique, converted dates to apply the conversion.
|
26
|
+
#
|
27
|
+
# @return [Expr]
|
28
|
+
#
|
29
|
+
# @example
|
30
|
+
# s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
|
31
|
+
# s.str.to_date
|
32
|
+
# # =>
|
33
|
+
# # shape: (3,)
|
34
|
+
# # Series: '' [date]
|
35
|
+
# # [
|
36
|
+
# # 2020-01-01
|
37
|
+
# # 2020-02-01
|
38
|
+
# # 2020-03-01
|
39
|
+
# # ]
|
40
|
+
def to_date(format = nil, strict: true, exact: true, cache: true)
|
41
|
+
_validate_format_argument(format)
|
42
|
+
Utils.wrap_expr(_rbexpr.str_to_date(format, strict, exact, cache))
|
43
|
+
end
|
44
|
+
|
45
|
+
# Convert a Utf8 column into a Datetime column.
|
46
|
+
#
|
47
|
+
# @param format [String]
|
48
|
+
# Format to use for conversion. Refer to the
|
49
|
+
# [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
50
|
+
# for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
|
51
|
+
# If set to nil (default), the format is inferred from the data.
|
52
|
+
# @param time_unit ["us", "ns", "ms"]
|
53
|
+
# Unit of time for the resulting Datetime column. If set to nil (default),
|
54
|
+
# the time unit is inferred from the format string if given, eg:
|
55
|
+
# `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
|
56
|
+
# found, the default is `"us"`.
|
57
|
+
# @param time_zone [String]
|
58
|
+
# Time zone for the resulting Datetime column.
|
59
|
+
# @param strict [Boolean]
|
60
|
+
# Raise an error if any conversion fails.
|
61
|
+
# @param exact [Boolean]
|
62
|
+
# Require an exact format match. If false, allow the format to match anywhere
|
63
|
+
# in the target string.
|
64
|
+
# @param cache [Boolean]
|
65
|
+
# Use a cache of unique, converted datetimes to apply the conversion.
|
66
|
+
#
|
67
|
+
# @return [Expr]
|
68
|
+
#
|
69
|
+
# @example
|
70
|
+
# s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
71
|
+
# s.str.to_datetime("%Y-%m-%d %H:%M%#z")
|
72
|
+
# # =>
|
73
|
+
# # shape: (2,)
|
74
|
+
# # Series: '' [datetime[μs, UTC]]
|
75
|
+
# # [
|
76
|
+
# # 2020-01-01 01:00:00 UTC
|
77
|
+
# # 2020-01-01 02:00:00 UTC
|
78
|
+
# # ]
|
79
|
+
def to_datetime(
|
80
|
+
format = nil,
|
81
|
+
time_unit: nil,
|
82
|
+
time_zone: nil,
|
83
|
+
strict: true,
|
84
|
+
exact: true,
|
85
|
+
cache: true,
|
86
|
+
ambiguous: "raise"
|
87
|
+
)
|
88
|
+
_validate_format_argument(format)
|
89
|
+
unless ambiguous.is_a?(Expr)
|
90
|
+
ambiguous = Polars.lit(ambiguous)
|
91
|
+
end
|
92
|
+
Utils.wrap_expr(
|
93
|
+
_rbexpr.str_to_datetime(
|
94
|
+
format,
|
95
|
+
time_unit,
|
96
|
+
time_zone,
|
97
|
+
strict,
|
98
|
+
exact,
|
99
|
+
cache,
|
100
|
+
ambiguous._rbexpr
|
101
|
+
)
|
102
|
+
)
|
103
|
+
end
|
104
|
+
|
105
|
+
# Convert a Utf8 column into a Time column.
|
106
|
+
#
|
107
|
+
# @param format [String]
|
108
|
+
# Format to use for conversion. Refer to the
|
109
|
+
# [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
110
|
+
# for the full specification. Example: `"%H:%M:%S"`.
|
111
|
+
# If set to nil (default), the format is inferred from the data.
|
112
|
+
# @param strict [Boolean]
|
113
|
+
# Raise an error if any conversion fails.
|
114
|
+
# @param cache [Boolean]
|
115
|
+
# Use a cache of unique, converted times to apply the conversion.
|
116
|
+
#
|
117
|
+
# @return [Expr]
|
118
|
+
#
|
119
|
+
# @example
|
120
|
+
# s = Polars::Series.new(["01:00", "02:00", "03:00"])
|
121
|
+
# s.str.to_time("%H:%M")
|
122
|
+
# # =>
|
123
|
+
# # shape: (3,)
|
124
|
+
# # Series: '' [time]
|
125
|
+
# # [
|
126
|
+
# # 01:00:00
|
127
|
+
# # 02:00:00
|
128
|
+
# # 03:00:00
|
129
|
+
# # ]
|
130
|
+
def to_time(format = nil, strict: true, cache: true)
|
131
|
+
_validate_format_argument(format)
|
132
|
+
Utils.wrap_expr(_rbexpr.str_to_time(format, strict, cache))
|
133
|
+
end
|
134
|
+
|
135
|
+
# Parse a Utf8 expression to a Date/Datetime/Time type.
|
136
|
+
#
|
137
|
+
# @param dtype [Object]
|
138
|
+
# The data type to convert into. Can be either Date, Datetime, or Time.
|
139
|
+
# @param format [String]
|
140
|
+
# Format to use, refer to the
|
141
|
+
# [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
|
142
|
+
# for specification. Example: `"%y-%m-%d"`.
|
143
|
+
# @param strict [Boolean]
|
144
|
+
# Raise an error if any conversion fails.
|
145
|
+
# @param exact [Boolean]
|
146
|
+
# - If true, require an exact format match.
|
147
|
+
# - If false, allow the format to match anywhere in the target string.
|
148
|
+
# @param utc [Boolean]
|
149
|
+
# Parse timezone aware datetimes as UTC. This may be useful if you have data
|
150
|
+
# with mixed offsets. NOTE: this argument is accepted but is currently not used by the method body.
|
151
|
+
#
|
152
|
+
# @return [Expr]
|
153
|
+
#
|
154
|
+
# @note
|
155
|
+
# When parsing a Datetime the column precision will be inferred from
|
156
|
+
# the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
|
157
|
+
# no fractional second component is found then the default is "us".
|
158
|
+
#
|
159
|
+
# @example Dealing with a consistent format:
|
160
|
+
# s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
|
161
|
+
# s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
|
162
|
+
# # =>
|
163
|
+
# # shape: (2,)
|
164
|
+
# # Series: '' [datetime[μs, UTC]]
|
165
|
+
# # [
|
166
|
+
# # 2020-01-01 01:00:00 UTC
|
167
|
+
# # 2020-01-01 02:00:00 UTC
|
168
|
+
# # ]
|
169
|
+
#
|
170
|
+
# @example Dealing with different formats.
|
171
|
+
# s = Polars::Series.new(
|
172
|
+
# "date",
|
173
|
+
# [
|
174
|
+
# "2021-04-22",
|
175
|
+
# "2022-01-04 00:00:00",
|
176
|
+
# "01/31/22",
|
177
|
+
# "Sun Jul 8 00:34:60 2001",
|
178
|
+
# ]
|
179
|
+
# )
|
180
|
+
# s.to_frame.select(
|
181
|
+
# Polars.coalesce(
|
182
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
|
183
|
+
# Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
|
184
|
+
# Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
|
185
|
+
# Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
|
186
|
+
# )
|
187
|
+
# ).to_series
|
188
|
+
# # =>
|
189
|
+
# # shape: (4,)
|
190
|
+
# # Series: 'date' [date]
|
191
|
+
# # [
|
192
|
+
# # 2021-04-22
|
193
|
+
# # 2022-01-04
|
194
|
+
# # 2022-01-31
|
195
|
+
# # 2001-07-08
|
196
|
+
# # ]
|
197
|
+
def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
|
198
|
+
_validate_format_argument(format)
|
199
|
+
|
200
|
+
if dtype == Date
|
201
|
+
to_date(format, strict: strict, exact: exact, cache: cache)
|
202
|
+
elsif dtype == Datetime || dtype.is_a?(Datetime)
|
203
|
+
dtype = Datetime.new if dtype == Datetime
|
204
|
+
time_unit = dtype.time_unit
|
205
|
+
time_zone = dtype.time_zone
|
206
|
+
to_datetime(format, time_unit: time_unit, time_zone: time_zone, strict: strict, exact: exact, cache: cache)
|
207
|
+
elsif dtype == Time
|
208
|
+
to_time(format, strict: strict, cache: cache)
|
209
|
+
else
|
210
|
+
raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
# Convert a String column into a Decimal column.
|
215
|
+
#
|
216
|
+
# This method infers the needed parameters `precision` and `scale`.
|
217
|
+
#
|
218
|
+
# @param inference_length [Integer]
|
219
|
+
# Number of elements to parse to determine the `precision` and `scale`.
|
220
|
+
#
|
221
|
+
# @return [Expr]
|
222
|
+
#
|
223
|
+
# @example
|
224
|
+
# df = Polars::DataFrame.new(
|
225
|
+
# {
|
226
|
+
# "numbers": [
|
227
|
+
# "40.12",
|
228
|
+
# "3420.13",
|
229
|
+
# "120134.19",
|
230
|
+
# "3212.98",
|
231
|
+
# "12.90",
|
232
|
+
# "143.09",
|
233
|
+
# "143.9"
|
234
|
+
# ]
|
235
|
+
# }
|
236
|
+
# )
|
237
|
+
# df.with_columns(numbers_decimal: Polars.col("numbers").str.to_decimal)
|
238
|
+
# # =>
|
239
|
+
# # shape: (7, 2)
|
240
|
+
# # ┌───────────┬─────────────────┐
|
241
|
+
# # │ numbers ┆ numbers_decimal │
|
242
|
+
# # │ --- ┆ --- │
|
243
|
+
# # │ str ┆ decimal[*,2] │
|
244
|
+
# # ╞═══════════╪═════════════════╡
|
245
|
+
# # │ 40.12 ┆ 40.12 │
|
246
|
+
# # │ 3420.13 ┆ 3420.13 │
|
247
|
+
# # │ 120134.19 ┆ 120134.19 │
|
248
|
+
# # │ 3212.98 ┆ 3212.98 │
|
249
|
+
# # │ 12.90 ┆ 12.90 │
|
250
|
+
# # │ 143.09 ┆ 143.09 │
|
251
|
+
# # │ 143.9 ┆ 143.90 │
|
252
|
+
# # └───────────┴─────────────────┘
|
253
|
+
def to_decimal(inference_length = 100)
|
254
|
+
Utils.wrap_expr(_rbexpr.str_to_decimal(inference_length))
|
255
|
+
end
|
256
|
+
|
257
|
+
# Get length of the strings as `:u32` (as number of bytes).
|
258
|
+
#
|
259
|
+
# @return [Expr]
|
260
|
+
#
|
261
|
+
# @note
|
262
|
+
# The returned lengths are equal to the number of bytes in the UTF8 string. If you
|
263
|
+
# need the length in terms of the number of characters, use `len_chars` (alias: `n_chars`) instead.
|
264
|
+
#
|
265
|
+
# @example
|
266
|
+
# df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
|
267
|
+
# [
|
268
|
+
# Polars.col("s").str.len_bytes.alias("length"),
|
269
|
+
# Polars.col("s").str.len_chars.alias("nchars")
|
270
|
+
# ]
|
271
|
+
# )
|
272
|
+
# df
|
273
|
+
# # =>
|
274
|
+
# # shape: (4, 3)
|
275
|
+
# # ┌──────┬────────┬────────┐
|
276
|
+
# # │ s ┆ length ┆ nchars │
|
277
|
+
# # │ --- ┆ --- ┆ --- │
|
278
|
+
# # │ str ┆ u32 ┆ u32 │
|
279
|
+
# # ╞══════╪════════╪════════╡
|
280
|
+
# # │ Café ┆ 5 ┆ 4 │
|
281
|
+
# # │ null ┆ null ┆ null │
|
282
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
283
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
284
|
+
# # └──────┴────────┴────────┘
|
285
|
+
def len_bytes
|
286
|
+
Utils.wrap_expr(_rbexpr.str_len_bytes)
|
287
|
+
end
|
288
|
+
alias_method :lengths, :len_bytes
|
289
|
+
|
290
|
+
# Get length of the strings as `:u32` (as number of chars).
|
291
|
+
#
|
292
|
+
# @return [Expr]
|
293
|
+
#
|
294
|
+
# @note
|
295
|
+
# If you know that you are working with ASCII text, `len_bytes` (alias: `lengths`) will be
|
296
|
+
# equivalent, and faster (returns length in terms of the number of bytes).
|
297
|
+
#
|
298
|
+
# @example
|
299
|
+
# df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
|
300
|
+
# [
|
301
|
+
# Polars.col("s").str.len_bytes.alias("length"),
|
302
|
+
# Polars.col("s").str.len_chars.alias("nchars")
|
303
|
+
# ]
|
304
|
+
# )
|
305
|
+
# df
|
306
|
+
# # =>
|
307
|
+
# # shape: (4, 3)
|
308
|
+
# # ┌──────┬────────┬────────┐
|
309
|
+
# # │ s ┆ length ┆ nchars │
|
310
|
+
# # │ --- ┆ --- ┆ --- │
|
311
|
+
# # │ str ┆ u32 ┆ u32 │
|
312
|
+
# # ╞══════╪════════╪════════╡
|
313
|
+
# # │ Café ┆ 5 ┆ 4 │
|
314
|
+
# # │ null ┆ null ┆ null │
|
315
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
316
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
317
|
+
# # └──────┴────────┴────────┘
|
318
|
+
def len_chars
|
319
|
+
Utils.wrap_expr(_rbexpr.str_len_chars)
|
320
|
+
end
|
321
|
+
alias_method :n_chars, :len_chars
|
322
|
+
|
323
|
+
# Vertically concatenate the values in the column into a single string value.
|
324
|
+
#
|
325
|
+
# @param delimiter [String]
|
326
|
+
# The delimiter to insert between consecutive string values.
|
327
|
+
# @param ignore_nulls [Boolean]
|
328
|
+
# Ignore null values (default).
|
329
|
+
#
|
330
|
+
# @return [Expr]
|
331
|
+
#
|
332
|
+
# @example
|
333
|
+
# df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
|
334
|
+
# df.select(Polars.col("foo").str.join("-"))
|
335
|
+
# # =>
|
336
|
+
# # shape: (1, 1)
|
337
|
+
# # ┌─────┐
|
338
|
+
# # │ foo │
|
339
|
+
# # │ --- │
|
340
|
+
# # │ str │
|
341
|
+
# # ╞═════╡
|
342
|
+
# # │ 1-2 │
|
343
|
+
# # └─────┘
|
344
|
+
#
|
345
|
+
# @example
|
346
|
+
# df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
|
347
|
+
# df.select(Polars.col("foo").str.join("-", ignore_nulls: false))
|
348
|
+
# # =>
|
349
|
+
# # shape: (1, 1)
|
350
|
+
# # ┌──────┐
|
351
|
+
# # │ foo │
|
352
|
+
# # │ --- │
|
353
|
+
# # │ str │
|
354
|
+
# # ╞══════╡
|
355
|
+
# # │ null │
|
356
|
+
# # └──────┘
|
357
|
+
def join(delimiter = "-", ignore_nulls: true)
|
358
|
+
Utils.wrap_expr(_rbexpr.str_join(delimiter, ignore_nulls))
|
359
|
+
end
|
360
|
+
alias_method :concat, :join
|
361
|
+
|
362
|
+
# Transform to uppercase variant.
|
363
|
+
#
|
364
|
+
# @return [Expr]
|
365
|
+
#
|
366
|
+
# @example
|
367
|
+
# df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
|
368
|
+
# df.select(Polars.col("foo").str.to_uppercase)
|
369
|
+
# # =>
|
370
|
+
# # shape: (2, 1)
|
371
|
+
# # ┌─────┐
|
372
|
+
# # │ foo │
|
373
|
+
# # │ --- │
|
374
|
+
# # │ str │
|
375
|
+
# # ╞═════╡
|
376
|
+
# # │ CAT │
|
377
|
+
# # │ DOG │
|
378
|
+
# # └─────┘
|
379
|
+
def to_uppercase
|
380
|
+
Utils.wrap_expr(_rbexpr.str_to_uppercase)
|
381
|
+
end
|
382
|
+
|
383
|
+
# Transform to lowercase variant.
|
384
|
+
#
|
385
|
+
# @return [Expr]
|
386
|
+
#
|
387
|
+
# @example
|
388
|
+
# df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
|
389
|
+
# df.select(Polars.col("foo").str.to_lowercase)
|
390
|
+
# # =>
|
391
|
+
# # shape: (2, 1)
|
392
|
+
# # ┌─────┐
|
393
|
+
# # │ foo │
|
394
|
+
# # │ --- │
|
395
|
+
# # │ str │
|
396
|
+
# # ╞═════╡
|
397
|
+
# # │ cat │
|
398
|
+
# # │ dog │
|
399
|
+
# # └─────┘
|
400
|
+
def to_lowercase
|
401
|
+
Utils.wrap_expr(_rbexpr.str_to_lowercase)
|
402
|
+
end
|
403
|
+
|
404
|
+
# Transform to titlecase variant. Not yet available: this method currently always raises `Todo`.
|
405
|
+
#
|
406
|
+
# @return [Expr]
|
407
|
+
#
|
408
|
+
# @example
|
409
|
+
# df = Polars::DataFrame.new(
|
410
|
+
# {"sing": ["welcome to my world", "THERE'S NO TURNING BACK"]}
|
411
|
+
# )
|
412
|
+
# df.with_columns(foo_title: Polars.col("sing").str.to_titlecase)
|
413
|
+
# # =>
|
414
|
+
# # shape: (2, 2)
|
415
|
+
# # ┌─────────────────────────┬─────────────────────────┐
|
416
|
+
# # │ sing ┆ foo_title │
|
417
|
+
# # │ --- ┆ --- │
|
418
|
+
# # │ str ┆ str │
|
419
|
+
# # ╞═════════════════════════╪═════════════════════════╡
|
420
|
+
# # │ welcome to my world ┆ Welcome To My World │
|
421
|
+
# # │ THERE'S NO TURNING BACK ┆ There's No Turning Back │
|
422
|
+
# # └─────────────────────────┴─────────────────────────┘
|
423
|
+
def to_titlecase
|
424
|
+
raise Todo
|
425
|
+
Utils.wrap_expr(_rbexpr.str_to_titlecase)
|
426
|
+
end
|
427
|
+
|
428
|
+
# Remove leading and trailing whitespace.
|
429
|
+
#
|
430
|
+
# @param characters [String, nil]
|
431
|
+
# The set of characters to strip from both ends. If nil (default), whitespace is removed instead.
|
432
|
+
#
|
433
|
+
# @return [Expr]
|
434
|
+
#
|
435
|
+
# @example
|
436
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
437
|
+
# df.select(Polars.col("foo").str.strip)
|
438
|
+
# # =>
|
439
|
+
# # shape: (3, 1)
|
440
|
+
# # ┌───────┐
|
441
|
+
# # │ foo │
|
442
|
+
# # │ --- │
|
443
|
+
# # │ str │
|
444
|
+
# # ╞═══════╡
|
445
|
+
# # │ lead │
|
446
|
+
# # │ trail │
|
447
|
+
# # │ both │
|
448
|
+
# # └───────┘
|
449
|
+
def strip_chars(characters = nil)
|
450
|
+
characters = Utils.parse_into_expression(characters, str_as_lit: true)
|
451
|
+
Utils.wrap_expr(_rbexpr.str_strip_chars(characters))
|
452
|
+
end
|
453
|
+
alias_method :strip, :strip_chars
|
454
|
+
|
455
|
+
# Remove leading whitespace.
|
456
|
+
#
|
457
|
+
# @param characters [String, nil]
|
458
|
+
# The set of characters to strip from the start. If nil (default), leading whitespace is removed instead.
|
459
|
+
#
|
460
|
+
# @return [Expr]
|
461
|
+
#
|
462
|
+
# @example
|
463
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
464
|
+
# df.select(Polars.col("foo").str.lstrip)
|
465
|
+
# # =>
|
466
|
+
# # shape: (3, 1)
|
467
|
+
# # ┌────────┐
|
468
|
+
# # │ foo │
|
469
|
+
# # │ --- │
|
470
|
+
# # │ str │
|
471
|
+
# # ╞════════╡
|
472
|
+
# # │ lead │
|
473
|
+
# # │ trail │
|
474
|
+
# # │ both │
|
475
|
+
# # └────────┘
|
476
|
+
def strip_chars_start(characters = nil)
|
477
|
+
characters = Utils.parse_into_expression(characters, str_as_lit: true)
|
478
|
+
Utils.wrap_expr(_rbexpr.str_strip_chars_start(characters))
|
479
|
+
end
|
480
|
+
alias_method :lstrip, :strip_chars_start
|
481
|
+
|
482
|
+
# Remove trailing whitespace.
|
483
|
+
#
|
484
|
+
# @param characters [String, nil]
|
485
|
+
# The set of characters to strip from the end. If nil (default), trailing whitespace is removed instead.
|
486
|
+
#
|
487
|
+
# @return [Expr]
|
488
|
+
#
|
489
|
+
# @example
|
490
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
491
|
+
# df.select(Polars.col("foo").str.rstrip)
|
492
|
+
# # =>
|
493
|
+
# # shape: (3, 1)
|
494
|
+
# # ┌───────┐
|
495
|
+
# # │ foo │
|
496
|
+
# # │ --- │
|
497
|
+
# # │ str │
|
498
|
+
# # ╞═══════╡
|
499
|
+
# # │ lead │
|
500
|
+
# # │ trail │
|
501
|
+
# # │ both │
|
502
|
+
# # └───────┘
|
503
|
+
def strip_chars_end(characters = nil)
|
504
|
+
characters = Utils.parse_into_expression(characters, str_as_lit: true)
|
505
|
+
Utils.wrap_expr(_rbexpr.str_strip_chars_end(characters))
|
506
|
+
end
|
507
|
+
alias_method :rstrip, :strip_chars_end
|
508
|
+
|
509
|
+
# Remove prefix.
|
510
|
+
#
|
511
|
+
# The prefix will be removed from the string exactly once, if found.
|
512
|
+
#
|
513
|
+
# @param prefix [String]
|
514
|
+
# The prefix to be removed.
|
515
|
+
#
|
516
|
+
# @return [Expr]
|
517
|
+
#
|
518
|
+
# @example
|
519
|
+
# df = Polars::DataFrame.new({"a" => ["foobar", "foofoobar", "foo", "bar"]})
|
520
|
+
# df.with_columns(Polars.col("a").str.strip_prefix("foo").alias("stripped"))
|
521
|
+
# # =>
|
522
|
+
# # shape: (4, 2)
|
523
|
+
# # ┌───────────┬──────────┐
|
524
|
+
# # │ a ┆ stripped │
|
525
|
+
# # │ --- ┆ --- │
|
526
|
+
# # │ str ┆ str │
|
527
|
+
# # ╞═══════════╪══════════╡
|
528
|
+
# # │ foobar ┆ bar │
|
529
|
+
# # │ foofoobar ┆ foobar │
|
530
|
+
# # │ foo ┆ │
|
531
|
+
# # │ bar ┆ bar │
|
532
|
+
# # └───────────┴──────────┘
|
533
|
+
def strip_prefix(prefix)
|
534
|
+
prefix = Utils.parse_into_expression(prefix, str_as_lit: true)
|
535
|
+
Utils.wrap_expr(_rbexpr.str_strip_prefix(prefix))
|
536
|
+
end
|
537
|
+
|
538
|
+
# Remove suffix.
|
539
|
+
#
|
540
|
+
# The suffix will be removed from the string exactly once, if found.
|
541
|
+
#
|
542
|
+
#
|
543
|
+
# @param suffix [String]
|
544
|
+
# The suffix to be removed.
|
545
|
+
#
|
546
|
+
# @return [Expr]
|
547
|
+
#
|
548
|
+
# @example
|
549
|
+
# df = Polars::DataFrame.new({"a" => ["foobar", "foobarbar", "foo", "bar"]})
|
550
|
+
# df.with_columns(Polars.col("a").str.strip_suffix("bar").alias("stripped"))
|
551
|
+
# # =>
|
552
|
+
# # shape: (4, 2)
|
553
|
+
# # ┌───────────┬──────────┐
|
554
|
+
# # │ a ┆ stripped │
|
555
|
+
# # │ --- ┆ --- │
|
556
|
+
# # │ str ┆ str │
|
557
|
+
# # ╞═══════════╪══════════╡
|
558
|
+
# # │ foobar ┆ foo │
|
559
|
+
# # │ foobarbar ┆ foobar │
|
560
|
+
# # │ foo ┆ foo │
|
561
|
+
# # │ bar ┆ │
|
562
|
+
# # └───────────┴──────────┘
|
563
|
+
def strip_suffix(suffix)
|
564
|
+
suffix = Utils.parse_into_expression(suffix, str_as_lit: true)
|
565
|
+
Utils.wrap_expr(_rbexpr.str_strip_suffix(suffix))
|
566
|
+
end
|
567
|
+
|
568
|
+
# Pad the start of the string until it reaches the given length.
|
569
|
+
#
|
570
|
+
# @param length [Integer]
|
571
|
+
# Pad the string until it reaches this length. Strings with length equal to
|
572
|
+
# or greater than this value are returned as-is.
|
573
|
+
# @param fill_char [String]
|
574
|
+
# The character to pad the string with.
|
575
|
+
#
|
576
|
+
# @return [Expr]
|
577
|
+
#
|
578
|
+
# @example
|
579
|
+
# df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
|
580
|
+
# df.with_columns(padded: Polars.col("a").str.pad_start(8, "*"))
|
581
|
+
# # =>
|
582
|
+
# # shape: (4, 2)
|
583
|
+
# # ┌──────────────┬──────────────┐
|
584
|
+
# # │ a ┆ padded │
|
585
|
+
# # │ --- ┆ --- │
|
586
|
+
# # │ str ┆ str │
|
587
|
+
# # ╞══════════════╪══════════════╡
|
588
|
+
# # │ cow ┆ *****cow │
|
589
|
+
# # │ monkey ┆ **monkey │
|
590
|
+
# # │ hippopotamus ┆ hippopotamus │
|
591
|
+
# # │ null ┆ null │
|
592
|
+
# # └──────────────┴──────────────┘
|
593
|
+
def pad_start(length, fill_char = " ")
|
594
|
+
Utils.wrap_expr(_rbexpr.str_pad_start(length, fill_char))
|
595
|
+
end
|
596
|
+
alias_method :rjust, :pad_start
|
597
|
+
|
598
|
+
# Pad the end of the string until it reaches the given length.
|
599
|
+
#
|
600
|
+
# @param length [Integer]
|
601
|
+
# Pad the string until it reaches this length. Strings with length equal to
|
602
|
+
# or greater than this value are returned as-is.
|
603
|
+
# @param fill_char [String]
|
604
|
+
# The character to pad the string with.
|
605
|
+
#
|
606
|
+
# @return [Expr]
|
607
|
+
#
|
608
|
+
# @example
|
609
|
+
# df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
|
610
|
+
# df.with_columns(padded: Polars.col("a").str.pad_end(8, "*"))
|
611
|
+
# # =>
|
612
|
+
# # shape: (4, 2)
|
613
|
+
# # ┌──────────────┬──────────────┐
|
614
|
+
# # │ a ┆ padded │
|
615
|
+
# # │ --- ┆ --- │
|
616
|
+
# # │ str ┆ str │
|
617
|
+
# # ╞══════════════╪══════════════╡
|
618
|
+
# # │ cow ┆ cow***** │
|
619
|
+
# # │ monkey ┆ monkey** │
|
620
|
+
# # │ hippopotamus ┆ hippopotamus │
|
621
|
+
# # │ null ┆ null │
|
622
|
+
# # └──────────────┴──────────────┘
|
623
|
+
def pad_end(length, fill_char = " ")
|
624
|
+
Utils.wrap_expr(_rbexpr.str_pad_end(length, fill_char))
|
625
|
+
end
|
626
|
+
alias_method :ljust, :pad_end
|
627
|
+
|
628
|
+
# Fills the string with zeroes.
|
629
|
+
#
|
630
|
+
# Return a copy of the string left filled with ASCII '0' digits to make a string
|
631
|
+
# of length `length`.
|
632
|
+
#
|
633
|
+
# A leading sign prefix ('+'/'-') is handled by inserting the padding after the
|
634
|
+
# sign character rather than before. The original string is returned if `length` is
|
635
|
+
# less than or equal to `s.length`.
|
636
|
+
#
|
637
|
+
# @param length [Integer]
|
638
|
+
# Fill the value up to this length
|
639
|
+
#
|
640
|
+
# @return [Expr]
|
641
|
+
#
|
642
|
+
# @example
|
643
|
+
# df = Polars::DataFrame.new({"a" => [-1, 123, 999999, nil]})
|
644
|
+
# df.with_columns(Polars.col("a").cast(Polars::String).str.zfill(4).alias("zfill"))
|
645
|
+
# # =>
|
646
|
+
# # shape: (4, 2)
|
647
|
+
# # ┌────────┬────────┐
|
648
|
+
# # │ a ┆ zfill │
|
649
|
+
# # │ --- ┆ --- │
|
650
|
+
# # │ i64 ┆ str │
|
651
|
+
# # ╞════════╪════════╡
|
652
|
+
# # │ -1 ┆ -001 │
|
653
|
+
# # │ 123 ┆ 0123 │
|
654
|
+
# # │ 999999 ┆ 999999 │
|
655
|
+
# # │ null ┆ null │
|
656
|
+
# # └────────┴────────┘
|
657
|
+
def zfill(length)
|
658
|
+
length = Utils.parse_into_expression(length)
|
659
|
+
Utils.wrap_expr(_rbexpr.str_zfill(length))
|
660
|
+
end
|
661
|
+
|
662
|
+
# Check if string contains a substring that matches a regex.
|
663
|
+
#
|
664
|
+
# @param pattern [String]
|
665
|
+
# A valid regex pattern.
|
666
|
+
# @param literal [Boolean]
|
667
|
+
# Treat pattern as a literal string.
|
668
|
+
#
|
669
|
+
# @return [Expr]
|
670
|
+
#
|
671
|
+
# @example
|
672
|
+
# df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
|
673
|
+
# df.select(
|
674
|
+
# [
|
675
|
+
# Polars.col("a"),
|
676
|
+
# Polars.col("a").str.contains("cat|bit").alias("regex"),
|
677
|
+
# Polars.col("a").str.contains("rab$", literal: true).alias("literal")
|
678
|
+
# ]
|
679
|
+
# )
|
680
|
+
# # =>
|
681
|
+
# # shape: (4, 3)
|
682
|
+
# # ┌─────────────┬───────┬─────────┐
|
683
|
+
# # │ a ┆ regex ┆ literal │
|
684
|
+
# # │ --- ┆ --- ┆ --- │
|
685
|
+
# # │ str ┆ bool ┆ bool │
|
686
|
+
# # ╞═════════════╪═══════╪═════════╡
|
687
|
+
# # │ Crab ┆ false ┆ false │
|
688
|
+
# # │ cat and dog ┆ true ┆ false │
|
689
|
+
# # │ rab$bit ┆ true ┆ true │
|
690
|
+
# # │ null ┆ null ┆ null │
|
691
|
+
# # └─────────────┴───────┴─────────┘
|
692
|
+
def contains(pattern, literal: false, strict: true)
|
693
|
+
pattern = Utils.parse_into_expression(pattern, str_as_lit: true)
|
694
|
+
Utils.wrap_expr(_rbexpr.str_contains(pattern, literal, strict))
|
695
|
+
end
|
696
|
+
|
697
|
+
# Check if string values end with a substring.
|
698
|
+
#
|
699
|
+
# @param sub [String]
|
700
|
+
# Suffix substring.
|
701
|
+
#
|
702
|
+
# @return [Expr]
|
703
|
+
#
|
704
|
+
# @example
|
705
|
+
# df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
|
706
|
+
# df.with_column(
|
707
|
+
# Polars.col("fruits").str.ends_with("go").alias("has_suffix")
|
708
|
+
# )
|
709
|
+
# # =>
|
710
|
+
# # shape: (3, 2)
|
711
|
+
# # ┌────────┬────────────┐
|
712
|
+
# # │ fruits ┆ has_suffix │
|
713
|
+
# # │ --- ┆ --- │
|
714
|
+
# # │ str ┆ bool │
|
715
|
+
# # ╞════════╪════════════╡
|
716
|
+
# # │ apple ┆ false │
|
717
|
+
# # │ mango ┆ true │
|
718
|
+
# # │ null ┆ null │
|
719
|
+
# # └────────┴────────────┘
|
720
|
+
#
|
721
|
+
# @example Using `ends_with` as a filter condition:
|
722
|
+
# df.filter(Polars.col("fruits").str.ends_with("go"))
|
723
|
+
# # =>
|
724
|
+
# # shape: (1, 1)
|
725
|
+
# # ┌────────┐
|
726
|
+
# # │ fruits │
|
727
|
+
# # │ --- │
|
728
|
+
# # │ str │
|
729
|
+
# # ╞════════╡
|
730
|
+
# # │ mango │
|
731
|
+
# # └────────┘
|
732
|
+
def ends_with(sub)
|
733
|
+
sub = Utils.parse_into_expression(sub, str_as_lit: true)
|
734
|
+
Utils.wrap_expr(_rbexpr.str_ends_with(sub))
|
735
|
+
end
|
736
|
+
|
737
|
+
# Check if string values start with a substring.
|
738
|
+
#
|
739
|
+
# @param sub [String]
|
740
|
+
# Prefix substring.
|
741
|
+
#
|
742
|
+
# @return [Expr]
|
743
|
+
#
|
744
|
+
# @example
|
745
|
+
# df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
|
746
|
+
# df.with_column(
|
747
|
+
# Polars.col("fruits").str.starts_with("app").alias("has_prefix")
|
748
|
+
# )
|
749
|
+
# # =>
|
750
|
+
# # shape: (3, 2)
|
751
|
+
# # ┌────────┬────────────┐
|
752
|
+
# # │ fruits ┆ has_prefix │
|
753
|
+
# # │ --- ┆ --- │
|
754
|
+
# # │ str ┆ bool │
|
755
|
+
# # ╞════════╪════════════╡
|
756
|
+
# # │ apple ┆ true │
|
757
|
+
# # │ mango ┆ false │
|
758
|
+
# # │ null ┆ null │
|
759
|
+
# # └────────┴────────────┘
|
760
|
+
#
|
761
|
+
# @example Using `starts_with` as a filter condition:
|
762
|
+
# df.filter(Polars.col("fruits").str.starts_with("app"))
|
763
|
+
# # =>
|
764
|
+
# # shape: (1, 1)
|
765
|
+
# # ┌────────┐
|
766
|
+
# # │ fruits │
|
767
|
+
# # │ --- │
|
768
|
+
# # │ str │
|
769
|
+
# # ╞════════╡
|
770
|
+
# # │ apple │
|
771
|
+
# # └────────┘
|
772
|
+
def starts_with(sub)
|
773
|
+
sub = Utils.parse_into_expression(sub, str_as_lit: true)
|
774
|
+
Utils.wrap_expr(_rbexpr.str_starts_with(sub))
|
775
|
+
end
|
776
|
+
|
777
|
+
# Parse string values as JSON.
|
778
|
+
#
|
779
|
+
# Raises an error if an invalid JSON string is encountered.
|
780
|
+
#
|
781
|
+
# @param dtype [Object]
|
782
|
+
# The dtype to cast the extracted value to. If nil, the dtype will be
|
783
|
+
# inferred from the JSON value.
|
784
|
+
#
|
785
|
+
# @return [Expr]
|
786
|
+
#
|
787
|
+
# @example
|
788
|
+
# df = Polars::DataFrame.new(
|
789
|
+
# {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
|
790
|
+
# )
|
791
|
+
# dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
|
792
|
+
# df.select(Polars.col("json").str.json_decode(dtype))
|
793
|
+
# # =>
|
794
|
+
# # shape: (3, 1)
|
795
|
+
# # ┌───────────┐
|
796
|
+
# # │ json │
|
797
|
+
# # │ --- │
|
798
|
+
# # │ struct[2] │
|
799
|
+
# # ╞═══════════╡
|
800
|
+
# # │ {1,true} │
|
801
|
+
# # │ null │
|
802
|
+
# # │ {2,false} │
|
803
|
+
# # └───────────┘
|
804
|
+
def json_decode(dtype = nil, infer_schema_length: 100)
  # Convert only when a dtype was supplied; a nil dtype is forwarded unchanged.
  target_dtype = dtype.nil? ? nil : Utils.rb_type_to_dtype(dtype)
  decoded = _rbexpr.str_json_decode(target_dtype, infer_schema_length)
  Utils.wrap_expr(decoded)
end
|
810
|
+
alias_method :json_extract, :json_decode
|
811
|
+
|
812
|
+
# Extract the first match of json string with provided JSONPath expression.
|
813
|
+
#
|
814
|
+
# Raises an error if an invalid JSON string is encountered.
|
815
|
+
# All returned values are cast to Utf8, regardless of the original value.
|
816
|
+
#
|
817
|
+
# Documentation on JSONPath standard can be found
|
818
|
+
# [here](https://goessner.net/articles/JsonPath/).
|
819
|
+
#
|
820
|
+
# @param json_path [String]
|
821
|
+
# A valid JSON path query string.
|
822
|
+
#
|
823
|
+
# @return [Expr]
|
824
|
+
#
|
825
|
+
# @example
|
826
|
+
# df = Polars::DataFrame.new(
|
827
|
+
# {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
|
828
|
+
# )
|
829
|
+
# df.select(Polars.col("json_val").str.json_path_match("$.a"))
|
830
|
+
# # =>
|
831
|
+
# # shape: (5, 1)
|
832
|
+
# # ┌──────────┐
|
833
|
+
# # │ json_val │
|
834
|
+
# # │ --- │
|
835
|
+
# # │ str │
|
836
|
+
# # ╞══════════╡
|
837
|
+
# # │ 1 │
|
838
|
+
# # │ null │
|
839
|
+
# # │ 2 │
|
840
|
+
# # │ 2.1 │
|
841
|
+
# # │ true │
|
842
|
+
# # └──────────┘
|
843
|
+
def json_path_match(json_path)
  # `str_as_lit: true` asks the parser to treat a String path as a literal.
  path_expr = Utils.parse_into_expression(json_path, str_as_lit: true)
  matched = _rbexpr.str_json_path_match(path_expr)
  Utils.wrap_expr(matched)
end
|
847
|
+
|
848
|
+
# Decode a value using the provided encoding.
|
849
|
+
#
|
850
|
+
# @param encoding ["hex", "base64"]
|
851
|
+
# The encoding to use.
|
852
|
+
# @param strict [Boolean]
|
853
|
+
# How to handle invalid inputs:
|
854
|
+
#
|
855
|
+
# - `true`: An error will be thrown if unable to decode a value.
|
856
|
+
# - `false`: Unhandled values will be replaced with `nil`.
|
857
|
+
#
|
858
|
+
# @return [Expr]
|
859
|
+
#
|
860
|
+
# @example
|
861
|
+
# df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
|
862
|
+
# df.select(Polars.col("encoded").str.decode("hex"))
|
863
|
+
# # =>
|
864
|
+
# # shape: (3, 1)
|
865
|
+
# # ┌─────────┐
|
866
|
+
# # │ encoded │
|
867
|
+
# # │ --- │
|
868
|
+
# # │ binary │
|
869
|
+
# # ╞═════════╡
|
870
|
+
# # │ b"foo" │
|
871
|
+
# # │ b"bar" │
|
872
|
+
# # │ null │
|
873
|
+
# # └─────────┘
|
874
|
+
def decode(encoding, strict: true)
  # Dispatch on the requested encoding; `strict` is forwarded untouched to the
  # native decoder.
  if encoding == "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  elsif encoding == "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # The message previously printed doubled braces ("{{...}}"), an escape that
    # is only meaningful in Python f-strings. `inspect` makes nil, symbols and
    # empty strings visible in the message.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
|
883
|
+
|
884
|
+
# Encode a value using the provided encoding.
|
885
|
+
#
|
886
|
+
# @param encoding ["hex", "base64"]
|
887
|
+
# The encoding to use.
|
888
|
+
#
|
889
|
+
# @return [Expr]
|
890
|
+
#
|
891
|
+
# @example
|
892
|
+
# df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
|
893
|
+
# df.select(Polars.col("strings").str.encode("hex"))
|
894
|
+
# # =>
|
895
|
+
# # shape: (3, 1)
|
896
|
+
# # ┌─────────┐
|
897
|
+
# # │ strings │
|
898
|
+
# # │ --- │
|
899
|
+
# # │ str │
|
900
|
+
# # ╞═════════╡
|
901
|
+
# # │ 666f6f │
|
902
|
+
# # │ 626172 │
|
903
|
+
# # │ null │
|
904
|
+
# # └─────────┘
|
905
|
+
def encode(encoding)
  # Dispatch on the requested encoding name.
  if encoding == "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  elsif encoding == "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # The message previously printed doubled braces ("{{...}}"), an escape that
    # is only meaningful in Python f-strings. `inspect` makes nil, symbols and
    # empty strings visible in the message.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
|
914
|
+
|
915
|
+
# Extract the target capture group from provided patterns.
|
916
|
+
#
|
917
|
+
# @param pattern [String]
|
918
|
+
# A valid regex pattern
|
919
|
+
# @param group_index [Integer]
|
920
|
+
# Index of the targeted capture group.
|
921
|
+
# Group 0 means the whole pattern; the first capture group begins at index 1.
|
922
|
+
# Defaults to the first capture group.
|
923
|
+
#
|
924
|
+
# @return [Expr]
|
925
|
+
#
|
926
|
+
# @example
|
927
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
928
|
+
# df.select(
|
929
|
+
# [
|
930
|
+
# Polars.col("foo").str.extract('(\d+)')
|
931
|
+
# ]
|
932
|
+
# )
|
933
|
+
# # =>
|
934
|
+
# # shape: (2, 1)
|
935
|
+
# # ┌─────┐
|
936
|
+
# # │ foo │
|
937
|
+
# # │ --- │
|
938
|
+
# # │ str │
|
939
|
+
# # ╞═════╡
|
940
|
+
# # │ 123 │
|
941
|
+
# # │ 678 │
|
942
|
+
# # └─────┘
|
943
|
+
def extract(pattern, group_index: 1)
  # `str_as_lit: true` asks the parser to treat a String pattern as a literal.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  captured = _rbexpr.str_extract(pattern_expr, group_index)
  Utils.wrap_expr(captured)
end
|
947
|
+
|
948
|
+
# Extracts all matches for the given regex pattern.
|
949
|
+
#
|
950
|
+
# Extracts each successive non-overlapping regex match in an individual string as
|
951
|
+
# an array.
|
952
|
+
#
|
953
|
+
# @param pattern [String]
|
954
|
+
# A valid regex pattern
|
955
|
+
#
|
956
|
+
# @return [Expr]
|
957
|
+
#
|
958
|
+
# @example
|
959
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
960
|
+
# df.select(
|
961
|
+
# [
|
962
|
+
# Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
|
963
|
+
# ]
|
964
|
+
# )
|
965
|
+
# # =>
|
966
|
+
# # shape: (2, 1)
|
967
|
+
# # ┌────────────────┐
|
968
|
+
# # │ extracted_nrs │
|
969
|
+
# # │ --- │
|
970
|
+
# # │ list[str] │
|
971
|
+
# # ╞════════════════╡
|
972
|
+
# # │ ["123", "45"] │
|
973
|
+
# # │ ["678", "910"] │
|
974
|
+
# # └────────────────┘
|
975
|
+
def extract_all(pattern)
  # `str_as_lit: true` asks the parser to treat a String pattern as a literal.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  all_matches = _rbexpr.str_extract_all(pattern_expr)
  Utils.wrap_expr(all_matches)
end
|
979
|
+
|
980
|
+
# Extract all capture groups for the given regex pattern.
|
981
|
+
#
|
982
|
+
# @param pattern [String]
|
983
|
+
# A valid regular expression pattern containing at least one capture group,
|
984
|
+
# compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
|
985
|
+
#
|
986
|
+
# @return [Expr]
|
987
|
+
#
|
988
|
+
# @example
|
989
|
+
# df = Polars::DataFrame.new(
|
990
|
+
# {
|
991
|
+
# "url": [
|
992
|
+
# "http://vote.com/ballon_dor?candidate=messi&ref=python",
|
993
|
+
# "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
|
994
|
+
# "http://vote.com/ballon_dor?error=404&ref=rust"
|
995
|
+
# ]
|
996
|
+
# }
|
997
|
+
# )
|
998
|
+
# pattern = /candidate=(?<candidate>\w+)&ref=(?<ref>\w+)/.to_s
|
999
|
+
# df.select(captures: Polars.col("url").str.extract_groups(pattern)).unnest(
|
1000
|
+
# "captures"
|
1001
|
+
# )
|
1002
|
+
# # =>
|
1003
|
+
# # shape: (3, 2)
|
1004
|
+
# # ┌───────────┬────────┐
|
1005
|
+
# # │ candidate ┆ ref │
|
1006
|
+
# # │ --- ┆ --- │
|
1007
|
+
# # │ str ┆ str │
|
1008
|
+
# # ╞═══════════╪════════╡
|
1009
|
+
# # │ messi ┆ python │
|
1010
|
+
# # │ weghorst ┆ polars │
|
1011
|
+
# # │ null ┆ null │
|
1012
|
+
# # └───────────┴────────┘
|
1013
|
+
#
|
1014
|
+
# @example Unnamed groups have their numerical position converted to a string:
|
1015
|
+
# pattern = /candidate=(\w+)&ref=(\w+)/.to_s
|
1016
|
+
# (
|
1017
|
+
# df.with_columns(
|
1018
|
+
# captures: Polars.col("url").str.extract_groups(pattern)
|
1019
|
+
# ).with_columns(name: Polars.col("captures").struct["1"].str.to_uppercase)
|
1020
|
+
# )
|
1021
|
+
# # =>
|
1022
|
+
# # shape: (3, 3)
|
1023
|
+
# # ┌─────────────────────────────────┬───────────────────────┬──────────┐
|
1024
|
+
# # │ url ┆ captures ┆ name │
|
1025
|
+
# # │ --- ┆ --- ┆ --- │
|
1026
|
+
# # │ str ┆ struct[2] ┆ str │
|
1027
|
+
# # ╞═════════════════════════════════╪═══════════════════════╪══════════╡
|
1028
|
+
# # │ http://vote.com/ballon_dor?can… ┆ {"messi","python"} ┆ MESSI │
|
1029
|
+
# # │ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │
|
1030
|
+
# # │ http://vote.com/ballon_dor?err… ┆ {null,null} ┆ null │
|
1031
|
+
# # └─────────────────────────────────┴───────────────────────┴──────────┘
|
1032
|
+
def extract_groups(pattern)
  # The pattern is handed to the native call as given; unlike the sibling
  # methods it is not run through Utils.parse_into_expression first.
  groups = _rbexpr.str_extract_groups(pattern)
  Utils.wrap_expr(groups)
end
|
1035
|
+
|
1036
|
+
# Count all successive non-overlapping regex matches.
|
1037
|
+
#
|
1038
|
+
# @param pattern [String]
|
1039
|
+
# A valid regex pattern
|
1040
|
+
#
|
1041
|
+
# @return [Expr]
|
1042
|
+
#
|
1043
|
+
# @example
|
1044
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
1045
|
+
# df.select(
|
1046
|
+
# [
|
1047
|
+
# Polars.col("foo").str.count_match('\d').alias("count_digits")
|
1048
|
+
# ]
|
1049
|
+
# )
|
1050
|
+
# # =>
|
1051
|
+
# # shape: (2, 1)
|
1052
|
+
# # ┌──────────────┐
|
1053
|
+
# # │ count_digits │
|
1054
|
+
# # │ --- │
|
1055
|
+
# # │ u32 │
|
1056
|
+
# # ╞══════════════╡
|
1057
|
+
# # │ 5 │
|
1058
|
+
# # │ 6 │
|
1059
|
+
# # └──────────────┘
|
1060
|
+
def count_matches(pattern, literal: false)
  # `literal` is forwarded unchanged to the native call.
  # NOTE(review): presumably it switches from regex to literal matching — confirm.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  counted = _rbexpr.str_count_matches(pattern_expr, literal)
  Utils.wrap_expr(counted)
end
|
1064
|
+
alias_method :count_match, :count_matches
|
1065
|
+
|
1066
|
+
# Split the string by a substring.
|
1067
|
+
#
|
1068
|
+
# @param by [String]
|
1069
|
+
# Substring to split by.
|
1070
|
+
# @param inclusive [Boolean]
|
1071
|
+
# If true, include the split character/string in the results.
|
1072
|
+
#
|
1073
|
+
# @return [Expr]
|
1074
|
+
#
|
1075
|
+
# @example
|
1076
|
+
# df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
|
1077
|
+
# df.select(Polars.col("s").str.split(" "))
|
1078
|
+
# # =>
|
1079
|
+
# # shape: (3, 1)
|
1080
|
+
# # ┌───────────────────────┐
|
1081
|
+
# # │ s │
|
1082
|
+
# # │ --- │
|
1083
|
+
# # │ list[str] │
|
1084
|
+
# # ╞═══════════════════════╡
|
1085
|
+
# # │ ["foo", "bar"] │
|
1086
|
+
# # │ ["foo-bar"] │
|
1087
|
+
# # │ ["foo", "bar", "baz"] │
|
1088
|
+
# # └───────────────────────┘
|
1089
|
+
def split(by, inclusive: false)
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  # Pick the native variant according to the `inclusive` flag.
  pieces =
    if inclusive
      _rbexpr.str_split_inclusive(separator)
    else
      _rbexpr.str_split(separator)
    end
  Utils.wrap_expr(pieces)
end
|
1096
|
+
|
1097
|
+
# Split the string by a substring using `n` splits.
|
1098
|
+
#
|
1099
|
+
# Results in a struct of `n+1` fields.
|
1100
|
+
#
|
1101
|
+
# If it cannot make `n` splits, the remaining field elements will be null.
|
1102
|
+
#
|
1103
|
+
# @param by [String]
|
1104
|
+
# Substring to split by.
|
1105
|
+
# @param n [Integer]
|
1106
|
+
# Number of splits to make.
|
1107
|
+
# @param inclusive [Boolean]
|
1108
|
+
# If true, include the split character/string in the results.
|
1109
|
+
#
|
1110
|
+
# @return [Expr]
|
1111
|
+
#
|
1112
|
+
# @example
|
1113
|
+
# df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
|
1114
|
+
# df.select(
|
1115
|
+
# [
|
1116
|
+
# Polars.col("x").str.split_exact("_", 1).alias("fields")
|
1117
|
+
# ]
|
1118
|
+
# )
|
1119
|
+
# # =>
|
1120
|
+
# # shape: (4, 1)
|
1121
|
+
# # ┌─────────────┐
|
1122
|
+
# # │ fields │
|
1123
|
+
# # │ --- │
|
1124
|
+
# # │ struct[2] │
|
1125
|
+
# # ╞═════════════╡
|
1126
|
+
# # │ {"a","1"} │
|
1127
|
+
# # │ {null,null} │
|
1128
|
+
# # │ {"c",null} │
|
1129
|
+
# # │ {"d","4"} │
|
1130
|
+
# # └─────────────┘
|
1131
|
+
def split_exact(by, n, inclusive: false)
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  # The inclusive variant is handled first; otherwise use the plain split.
  return Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(separator, n)) if inclusive

  Utils.wrap_expr(_rbexpr.str_split_exact(separator, n))
end
|
1139
|
+
|
1140
|
+
# Split the string by a substring, restricted to returning at most `n` items.
|
1141
|
+
#
|
1142
|
+
# If the number of possible splits is less than `n-1`, the remaining field
|
1143
|
+
# elements will be null. If the number of possible splits is `n-1` or greater,
|
1144
|
+
# the last (nth) substring will contain the remainder of the string.
|
1145
|
+
#
|
1146
|
+
# @param by [String]
|
1147
|
+
# Substring to split by.
|
1148
|
+
# @param n [Integer]
|
1149
|
+
# Max number of items to return.
|
1150
|
+
#
|
1151
|
+
# @return [Expr]
|
1152
|
+
#
|
1153
|
+
# @example
|
1154
|
+
# df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
|
1155
|
+
# df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
|
1156
|
+
# # =>
|
1157
|
+
# # shape: (4, 1)
|
1158
|
+
# # ┌───────────────────┐
|
1159
|
+
# # │ fields │
|
1160
|
+
# # │ --- │
|
1161
|
+
# # │ struct[2] │
|
1162
|
+
# # ╞═══════════════════╡
|
1163
|
+
# # │ {"foo","bar"} │
|
1164
|
+
# # │ {null,null} │
|
1165
|
+
# # │ {"foo-bar",null} │
|
1166
|
+
# # │ {"foo","bar baz"} │
|
1167
|
+
# # └───────────────────┘
|
1168
|
+
def splitn(by, n)
  # `str_as_lit: true` asks the parser to treat a String separator as a literal.
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  fields = _rbexpr.str_splitn(separator, n)
  Utils.wrap_expr(fields)
end
|
1172
|
+
|
1173
|
+
# Replace first matching regex/literal substring with a new string value.
|
1174
|
+
#
|
1175
|
+
# @param pattern [String]
|
1176
|
+
# Regex pattern.
|
1177
|
+
# @param value [String]
|
1178
|
+
# Replacement string.
|
1179
|
+
# @param literal [Boolean]
|
1180
|
+
# Treat pattern as a literal string.
|
1181
|
+
#
|
1182
|
+
# @return [Expr]
|
1183
|
+
#
|
1184
|
+
# @example
|
1185
|
+
# df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
|
1186
|
+
# df.with_column(
|
1187
|
+
# Polars.col("text").str.replace('abc\b', "ABC")
|
1188
|
+
# )
|
1189
|
+
# # =>
|
1190
|
+
# # shape: (2, 2)
|
1191
|
+
# # ┌─────┬────────┐
|
1192
|
+
# # │ id ┆ text │
|
1193
|
+
# # │ --- ┆ --- │
|
1194
|
+
# # │ i64 ┆ str │
|
1195
|
+
# # ╞═════╪════════╡
|
1196
|
+
# # │ 1 ┆ 123ABC │
|
1197
|
+
# # │ 2 ┆ abc456 │
|
1198
|
+
# # └─────┴────────┘
|
1199
|
+
def replace(pattern, value, literal: false, n: 1)
  # Both the pattern and the replacement are parsed with String-as-literal.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  value_expr = Utils.parse_into_expression(value, str_as_lit: true)
  # `n` is forwarded to str_replace_n.
  # NOTE(review): presumably the number of matches to replace — confirm.
  replaced = _rbexpr.str_replace_n(pattern_expr, value_expr, literal, n)
  Utils.wrap_expr(replaced)
end
|
1204
|
+
|
1205
|
+
# Replace all matching regex/literal substrings with a new string value.
|
1206
|
+
#
|
1207
|
+
# @param pattern [String]
|
1208
|
+
# Regex pattern.
|
1209
|
+
# @param value [String]
|
1210
|
+
# Replacement string.
|
1211
|
+
# @param literal [Boolean]
|
1212
|
+
# Treat pattern as a literal string.
|
1213
|
+
#
|
1214
|
+
# @return [Expr]
|
1215
|
+
#
|
1216
|
+
# @example
|
1217
|
+
# df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
|
1218
|
+
# df.with_column(Polars.col("text").str.replace_all("a", "-"))
|
1219
|
+
# # =>
|
1220
|
+
# # shape: (2, 2)
|
1221
|
+
# # ┌─────┬─────────┐
|
1222
|
+
# # │ id ┆ text │
|
1223
|
+
# # │ --- ┆ --- │
|
1224
|
+
# # │ i64 ┆ str │
|
1225
|
+
# # ╞═════╪═════════╡
|
1226
|
+
# # │ 1 ┆ -bc-bc │
|
1227
|
+
# # │ 2 ┆ 123-123 │
|
1228
|
+
# # └─────┴─────────┘
|
1229
|
+
def replace_all(pattern, value, literal: false)
  # Both the pattern and the replacement are parsed with String-as-literal.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  value_expr = Utils.parse_into_expression(value, str_as_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr, value_expr, literal)
  Utils.wrap_expr(replaced)
end
|
1234
|
+
|
1235
|
+
# Returns string values in reversed order.
|
1236
|
+
#
|
1237
|
+
# @return [Expr]
|
1238
|
+
#
|
1239
|
+
# @example
|
1240
|
+
# df = Polars::DataFrame.new({"text" => ["foo", "bar", "man\u0303ana"]})
|
1241
|
+
# df.with_columns(Polars.col("text").str.reverse.alias("reversed"))
|
1242
|
+
# # =>
|
1243
|
+
# # shape: (3, 2)
|
1244
|
+
# # ┌────────┬──────────┐
|
1245
|
+
# # │ text ┆ reversed │
|
1246
|
+
# # │ --- ┆ --- │
|
1247
|
+
# # │ str ┆ str │
|
1248
|
+
# # ╞════════╪══════════╡
|
1249
|
+
# # │ foo ┆ oof │
|
1250
|
+
# # │ bar ┆ rab │
|
1251
|
+
# # │ mañana ┆ anañam │
|
1252
|
+
# # └────────┴──────────┘
|
1253
|
+
def reverse
  # Delegate to the native call and wrap the result.
  reversed = _rbexpr.str_reverse
  Utils.wrap_expr(reversed)
end
|
1256
|
+
|
1257
|
+
# Create subslices of the string values of a Utf8 Series.
|
1258
|
+
#
|
1259
|
+
# @param offset [Integer]
|
1260
|
+
# Start index. Negative indexing is supported.
|
1261
|
+
# @param length [Integer]
|
1262
|
+
# Length of the slice. If set to `nil` (default), the slice is taken to the
|
1263
|
+
# end of the string.
|
1264
|
+
#
|
1265
|
+
# @return [Expr]
|
1266
|
+
#
|
1267
|
+
# @example
|
1268
|
+
# df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
|
1269
|
+
# df.with_column(
|
1270
|
+
# Polars.col("s").str.slice(-3).alias("s_sliced")
|
1271
|
+
# )
|
1272
|
+
# # =>
|
1273
|
+
# # shape: (4, 2)
|
1274
|
+
# # ┌─────────────┬──────────┐
|
1275
|
+
# # │ s ┆ s_sliced │
|
1276
|
+
# # │ --- ┆ --- │
|
1277
|
+
# # │ str ┆ str │
|
1278
|
+
# # ╞═════════════╪══════════╡
|
1279
|
+
# # │ pear ┆ ear │
|
1280
|
+
# # │ null ┆ null │
|
1281
|
+
# # │ papaya ┆ aya │
|
1282
|
+
# # │ dragonfruit ┆ uit │
|
1283
|
+
# # └─────────────┴──────────┘
|
1284
|
+
def slice(offset, length = nil)
  # Both bounds go through the expression parser; a nil length is handed to
  # the parser as-is rather than being special-cased here.
  start_expr = Utils.parse_into_expression(offset)
  length_expr = Utils.parse_into_expression(length)
  sliced = _rbexpr.str_slice(start_expr, length_expr)
  Utils.wrap_expr(sliced)
end
|
1289
|
+
|
1290
|
+
# Convert a Utf8 column into an Int64 column with base radix.
|
1291
|
+
#
|
1292
|
+
# @param base [Integer]
|
1293
|
+
# Positive integer which is the base of the string we are parsing.
|
1294
|
+
# Default: 10.
|
1295
|
+
# @param strict [Boolean]
|
1296
|
+
# If true (the default), any parse error or overflow is raised as a ComputeError.
|
1297
|
+
# If false, values that fail to parse or overflow are silently converted to null.
|
1298
|
+
#
|
1299
|
+
# @return [Expr]
|
1300
|
+
#
|
1301
|
+
# @example
|
1302
|
+
# df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
|
1303
|
+
# df.with_columns(Polars.col("bin").str.to_integer(base: 2, strict: false).alias("parsed"))
|
1304
|
+
# # =>
|
1305
|
+
# # shape: (4, 2)
|
1306
|
+
# # ┌─────────┬────────┐
|
1307
|
+
# # │ bin ┆ parsed │
|
1308
|
+
# # │ --- ┆ --- │
|
1309
|
+
# # │ str ┆ i64 │
|
1310
|
+
# # ╞═════════╪════════╡
|
1311
|
+
# # │ 110 ┆ 6 │
|
1312
|
+
# # │ 101 ┆ 5 │
|
1313
|
+
# # │ 010 ┆ 2 │
|
1314
|
+
# # │ invalid ┆ null │
|
1315
|
+
# # └─────────┴────────┘
|
1316
|
+
#
|
1317
|
+
# @example
|
1318
|
+
# df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
|
1319
|
+
# df.with_columns(Polars.col("hex").str.to_integer(base: 16, strict: true).alias("parsed"))
|
1320
|
+
# # =>
|
1321
|
+
# # shape: (4, 2)
|
1322
|
+
# # ┌──────┬────────┐
|
1323
|
+
# # │ hex ┆ parsed │
|
1324
|
+
# # │ --- ┆ --- │
|
1325
|
+
# # │ str ┆ i64 │
|
1326
|
+
# # ╞══════╪════════╡
|
1327
|
+
# # │ fa1e ┆ 64030 │
|
1328
|
+
# # │ ff00 ┆ 65280 │
|
1329
|
+
# # │ cafe ┆ 51966 │
|
1330
|
+
# # │ null ┆ null │
|
1331
|
+
# # └──────┴────────┘
|
1332
|
+
def to_integer(base: 10, strict: true)
  # The base is parsed with `str_as_lit: false`, so a String base is not
  # treated as a literal value.
  base_expr = Utils.parse_into_expression(base, str_as_lit: false)
  parsed = _rbexpr.str_to_integer(base_expr, strict)
  Utils.wrap_expr(parsed)
end
|
1336
|
+
|
1337
|
+
# Parse integers with base radix from strings.
|
1338
|
+
#
|
1339
|
+
# By default base 2. ParseError/Overflows become Nulls.
|
1340
|
+
#
|
1341
|
+
# @param radix [Integer]
|
1342
|
+
# Positive integer which is the base of the string we are parsing.
|
1343
|
+
# Default: 2.
|
1344
|
+
# @param strict [Boolean]
|
1345
|
+
# If true (the default), any parse error or overflow is raised as a ComputeError.
|
1346
|
+
# If false, values that fail to parse or overflow are silently converted to null.
|
1347
|
+
#
|
1348
|
+
# @return [Expr]
|
1349
|
+
#
|
1350
|
+
# @example
|
1351
|
+
# df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
|
1352
|
+
# df.select(Polars.col("bin").str.parse_int(2, strict: false))
|
1353
|
+
# # =>
|
1354
|
+
# # shape: (4, 1)
|
1355
|
+
# # ┌──────┐
|
1356
|
+
# # │ bin │
|
1357
|
+
# # │ --- │
|
1358
|
+
# # │ i32 │
|
1359
|
+
# # ╞══════╡
|
1360
|
+
# # │ 6 │
|
1361
|
+
# # │ 5 │
|
1362
|
+
# # │ 2 │
|
1363
|
+
# # │ null │
|
1364
|
+
# # └──────┘
|
1365
|
+
def parse_int(radix = 2, strict: true)
  # Forward the caller's radix. It was previously hard-coded to 2, so every
  # call parsed as binary regardless of the argument.
  # The result of to_integer is then cast to Int32 with the same strictness.
  to_integer(base: radix, strict: strict).cast(Int32, strict: strict)
end
|
1368
|
+
|
1369
|
+
# Use the aho-corasick algorithm to find matches.
|
1370
|
+
#
|
1371
|
+
# This version determines if any of the patterns find a match.
|
1372
|
+
#
|
1373
|
+
# @param patterns [Array<String>]
|
1374
|
+
# String patterns to search.
|
1375
|
+
# @param ascii_case_insensitive [Boolean]
|
1376
|
+
# Enable ASCII-aware case insensitive matching.
|
1377
|
+
# When this option is enabled, searching will be performed without respect
|
1378
|
+
# to case for ASCII letters (a-z and A-Z) only.
|
1379
|
+
#
|
1380
|
+
# @return [Expr]
|
1381
|
+
#
|
1382
|
+
# @example
|
1383
|
+
# df = Polars::DataFrame.new(
|
1384
|
+
# {
|
1385
|
+
# "lyrics": [
|
1386
|
+
# "Everybody wants to rule the world",
|
1387
|
+
# "Tell me what you want, what you really really want",
|
1388
|
+
# "Can you feel the love tonight"
|
1389
|
+
# ]
|
1390
|
+
# }
|
1391
|
+
# )
|
1392
|
+
# df.with_columns(
|
1393
|
+
# Polars.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
|
1394
|
+
# )
|
1395
|
+
# # =>
|
1396
|
+
# # shape: (3, 2)
|
1397
|
+
# # ┌─────────────────────────────────┬──────────────┐
|
1398
|
+
# # │ lyrics ┆ contains_any │
|
1399
|
+
# # │ --- ┆ --- │
|
1400
|
+
# # │ str ┆ bool │
|
1401
|
+
# # ╞═════════════════════════════════╪══════════════╡
|
1402
|
+
# # │ Everybody wants to rule the wo… ┆ false │
|
1403
|
+
# # │ Tell me what you want, what yo… ┆ true │
|
1404
|
+
# # │ Can you feel the love tonight ┆ true │
|
1405
|
+
# # └─────────────────────────────────┴──────────────┘
|
1406
|
+
def contains_any(patterns, ascii_case_insensitive: false)
  # `list_as_series: true` is passed so an Array of patterns is accepted by
  # the parser (the documented example passes an Array of Strings).
  pattern_expr = Utils.parse_into_expression(patterns, str_as_lit: false, list_as_series: true)
  found = _rbexpr.str_contains_any(pattern_expr, ascii_case_insensitive)
  Utils.wrap_expr(found)
end
|
1412
|
+
|
1413
|
+
# Use the aho-corasick algorithm to replace many matches.
|
1414
|
+
#
|
1415
|
+
# @param patterns [Array<String>]
|
1416
|
+
# String patterns to search and replace.
|
1417
|
+
# @param replace_with [String, Array<String>]
|
1418
|
+
# Strings to replace where a pattern was a match.
|
1419
|
+
# This can be broadcasted. So it supports many:one and many:many.
|
1420
|
+
# @param ascii_case_insensitive [Boolean]
|
1421
|
+
# Enable ASCII-aware case insensitive matching.
|
1422
|
+
# When this option is enabled, searching will be performed without respect
|
1423
|
+
# to case for ASCII letters (a-z and A-Z) only.
|
1424
|
+
#
|
1425
|
+
# @return [Expr]
|
1426
|
+
#
|
1427
|
+
# @example
|
1428
|
+
# df = Polars::DataFrame.new(
|
1429
|
+
# {
|
1430
|
+
# "lyrics": [
|
1431
|
+
# "Everybody wants to rule the world",
|
1432
|
+
# "Tell me what you want, what you really really want",
|
1433
|
+
# "Can you feel the love tonight"
|
1434
|
+
# ]
|
1435
|
+
# }
|
1436
|
+
# )
|
1437
|
+
# df.with_columns(
|
1438
|
+
# Polars.col("lyrics")
|
1439
|
+
# .str.replace_many(
|
1440
|
+
# ["me", "you", "they"],
|
1441
|
+
# ""
|
1442
|
+
# )
|
1443
|
+
# .alias("removes_pronouns")
|
1444
|
+
# )
|
1445
|
+
# # =>
|
1446
|
+
# # shape: (3, 2)
|
1447
|
+
# # ┌─────────────────────────────────┬─────────────────────────────────┐
|
1448
|
+
# # │ lyrics ┆ removes_pronouns │
|
1449
|
+
# # │ --- ┆ --- │
|
1450
|
+
# # │ str ┆ str │
|
1451
|
+
# # ╞═════════════════════════════════╪═════════════════════════════════╡
|
1452
|
+
# # │ Everybody wants to rule the wo… ┆ Everybody wants to rule the wo… │
|
1453
|
+
# # │ Tell me what you want, what yo… ┆ Tell what want, what really… │
|
1454
|
+
# # │ Can you feel the love tonight ┆ Can feel the love tonight │
|
1455
|
+
# # └─────────────────────────────────┴─────────────────────────────────┘
|
1456
|
+
#
|
1457
|
+
# @example
|
1458
|
+
# df.with_columns(
|
1459
|
+
# Polars.col("lyrics")
|
1460
|
+
# .str.replace_many(
|
1461
|
+
# ["me", "you"],
|
1462
|
+
# ["you", "me"]
|
1463
|
+
# )
|
1464
|
+
# .alias("confusing")
|
1465
|
+
# )
|
1466
|
+
# # =>
|
1467
|
+
# # shape: (3, 2)
|
1468
|
+
# # ┌─────────────────────────────────┬─────────────────────────────────┐
|
1469
|
+
# # │ lyrics ┆ confusing │
|
1470
|
+
# # │ --- ┆ --- │
|
1471
|
+
# # │ str ┆ str │
|
1472
|
+
# # ╞═════════════════════════════════╪═════════════════════════════════╡
|
1473
|
+
# # │ Everybody wants to rule the wo… ┆ Everybody wants to rule the wo… │
|
1474
|
+
# # │ Tell me what you want, what yo… ┆ Tell you what me want, what me… │
|
1475
|
+
# # │ Can you feel the love tonight ┆ Can me feel the love tonight │
|
1476
|
+
# # └─────────────────────────────────┴─────────────────────────────────┘
|
1477
|
+
def replace_many(patterns, replace_with, ascii_case_insensitive: false)
  # Patterns are parsed with `str_as_lit: false`, replacements with
  # `str_as_lit: true`; both are parsed with `list_as_series: true`.
  pattern_expr = Utils.parse_into_expression(patterns, str_as_lit: false, list_as_series: true)
  replacement_expr = Utils.parse_into_expression(replace_with, str_as_lit: true, list_as_series: true)
  replaced = _rbexpr.str_replace_many(pattern_expr, replacement_expr, ascii_case_insensitive)
  Utils.wrap_expr(replaced)
end
|
1488
|
+
|
1489
|
+
private
|
1490
|
+
|
1491
|
+
def _validate_format_argument(format)
  # TODO: validation is not implemented yet; `format` is accepted unchecked
  # and the method returns nil.
  nil
end
|
1494
|
+
end
|
1495
|
+
end
|