polars-df 0.10.0-x86_64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
@@ -0,0 +1,1519 @@
1
+ module Polars
2
+ # Namespace for string related expressions.
3
+ class StringExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
def initialize(expr)
  # Take the `_rbexpr` held by the given expression and store it on this object.
  rbexpr = expr._rbexpr
  self._rbexpr = rbexpr
end
11
+
12
+ # Convert a Utf8 column into a Date column.
13
+ #
14
+ # @param format [String]
15
+ # Format to use for conversion. Refer to the
16
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
17
+ # for the full specification. Example: `"%Y-%m-%d"`.
18
+ # If set to nil (default), the format is inferred from the data.
19
+ # @param strict [Boolean]
20
+ # Raise an error if any conversion fails.
21
+ # @param exact [Boolean]
22
+ # Require an exact format match. If false, allow the format to match anywhere
23
+ # in the target string.
24
+ # @param cache [Boolean]
25
+ # Use a cache of unique, converted dates to apply the conversion.
26
+ #
27
+ # @return [Expr]
28
+ #
29
+ # @example
30
+ # s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
31
+ # s.str.to_date
32
+ # # =>
33
+ # # shape: (3,)
34
+ # # Series: '' [date]
35
+ # # [
36
+ # # 2020-01-01
37
+ # # 2020-02-01
38
+ # # 2020-03-01
39
+ # # ]
40
def to_date(format = nil, strict: true, exact: true, cache: true)
  # Run the format check before building the expression.
  _validate_format_argument(format)
  date_rbexpr = _rbexpr.str_to_date(format, strict, exact, cache)
  Utils.wrap_expr(date_rbexpr)
end
44
+
45
+ # Convert a Utf8 column into a Datetime column.
46
+ #
47
+ # @param format [String]
48
+ # Format to use for conversion. Refer to the
49
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
50
+ # for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
51
+ # If set to nil (default), the format is inferred from the data.
52
+ # @param time_unit ["us", "ns", "ms"]
53
+ # Unit of time for the resulting Datetime column. If set to nil (default),
54
+ # the time unit is inferred from the format string if given, eg:
55
+ # `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
56
+ # found, the default is `"us"`.
57
+ # @param time_zone [String]
58
+ # Time zone for the resulting Datetime column.
59
+ # @param strict [Boolean]
60
+ # Raise an error if any conversion fails.
61
+ # @param exact [Boolean]
62
+ # Require an exact format match. If false, allow the format to match anywhere
63
+ # in the target string.
64
+ # @param cache [Boolean]
65
+ # Use a cache of unique, converted datetimes to apply the conversion.
66
+ #
67
+ # @return [Expr]
68
+ #
69
+ # @example
70
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
71
+ # s.str.to_datetime("%Y-%m-%d %H:%M%#z")
72
+ # # =>
73
+ # # shape: (2,)
74
+ # # Series: '' [datetime[μs, UTC]]
75
+ # # [
76
+ # # 2020-01-01 01:00:00 UTC
77
+ # # 2020-01-01 02:00:00 UTC
78
+ # # ]
79
def to_datetime(
  format = nil,
  time_unit: nil,
  time_zone: nil,
  strict: true,
  exact: true,
  cache: true,
  use_earliest: nil,
  ambiguous: "raise"
)
  _validate_format_argument(format)

  # `Utils.rename_use_earliest_to_ambiguous` combines the two arguments into a
  # single value; a result that is not already an Expr is wrapped with `Polars.lit`.
  resolved = Utils.rename_use_earliest_to_ambiguous(use_earliest, ambiguous)
  ambiguous_expr = resolved.is_a?(Expr) ? resolved : Polars.lit(resolved)

  datetime_rbexpr = _rbexpr.str_to_datetime(
    format, time_unit, time_zone, strict, exact, cache, ambiguous_expr._rbexpr
  )
  Utils.wrap_expr(datetime_rbexpr)
end
104
+
105
+ # Convert a Utf8 column into a Time column.
106
+ #
107
+ # @param format [String]
108
+ # Format to use for conversion. Refer to the
109
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
110
+ # for the full specification. Example: `"%H:%M:%S"`.
111
+ # If set to nil (default), the format is inferred from the data.
112
+ # @param strict [Boolean]
113
+ # Raise an error if any conversion fails.
114
+ # @param cache [Boolean]
115
+ # Use a cache of unique, converted times to apply the conversion.
116
+ #
117
+ # @return [Expr]
118
+ #
119
+ # @example
120
+ # s = Polars::Series.new(["01:00", "02:00", "03:00"])
121
+ # s.str.to_time("%H:%M")
122
+ # # =>
123
+ # # shape: (3,)
124
+ # # Series: '' [time]
125
+ # # [
126
+ # # 01:00:00
127
+ # # 02:00:00
128
+ # # 03:00:00
129
+ # # ]
130
def to_time(format = nil, strict: true, cache: true)
  # Run the format check before building the expression.
  _validate_format_argument(format)
  time_rbexpr = _rbexpr.str_to_time(format, strict, cache)
  Utils.wrap_expr(time_rbexpr)
end
134
+
135
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
136
+ #
137
+ # @param dtype [Object]
138
+ # The data type to convert into. Can be either Date, Datetime, or Time.
139
+ # @param format [String]
140
+ # Format to use, refer to the
141
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
142
+ # for specification. Example: `"%y-%m-%d"`.
143
+ # @param strict [Boolean]
144
+ # Raise an error if any conversion fails.
145
+ # @param exact [Boolean]
146
+ # - If true, require an exact format match.
147
+ # - If false, allow the format to match anywhere in the target string.
148
+ # @param utc [Boolean]
149
+ # Parse timezone aware datetimes as UTC. This may be useful if you have data
150
+ # with mixed offsets. Note: this argument is currently not used by the method.
151
+ #
152
+ # @return [Expr]
153
+ #
154
+ # @note
155
+ # When parsing a Datetime the column precision will be inferred from
156
+ # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
157
+ # no fractional second component is found then the default is "us".
158
+ #
159
+ # @example Dealing with a consistent format:
160
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
161
+ # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
162
+ # # =>
163
+ # # shape: (2,)
164
+ # # Series: '' [datetime[μs, UTC]]
165
+ # # [
166
+ # # 2020-01-01 01:00:00 UTC
167
+ # # 2020-01-01 02:00:00 UTC
168
+ # # ]
169
+ #
170
+ # @example Dealing with different formats.
171
+ # s = Polars::Series.new(
172
+ # "date",
173
+ # [
174
+ # "2021-04-22",
175
+ # "2022-01-04 00:00:00",
176
+ # "01/31/22",
177
+ # "Sun Jul 8 00:34:60 2001",
178
+ # ]
179
+ # )
180
+ # s.to_frame.select(
181
+ # Polars.coalesce(
182
+ # Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
183
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
184
+ # Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
185
+ # Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
186
+ # )
187
+ # ).to_series
188
+ # # =>
189
+ # # shape: (4,)
190
+ # # Series: 'date' [date]
191
+ # # [
192
+ # # 2021-04-22
193
+ # # 2022-01-04
194
+ # # 2022-01-31
195
+ # # 2001-07-08
196
+ # # ]
197
def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
  _validate_format_argument(format)

  # NOTE(review): `utc` is accepted but never read in this method.
  return to_date(format, strict: strict, exact: exact, cache: cache) if dtype == Date

  if dtype == Datetime || dtype.is_a?(Datetime)
    # A bare `Datetime` class is instantiated so its time_unit/time_zone can be read.
    datetime_dtype = dtype == Datetime ? Datetime.new : dtype
    unit = datetime_dtype.time_unit
    zone = datetime_dtype.time_zone
    return to_datetime(
      format,
      time_unit: unit,
      time_zone: zone,
      strict: strict,
      exact: exact,
      cache: cache
    )
  end

  return to_time(format, strict: strict, cache: cache) if dtype == Time

  raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
end
213
+
214
+ # Convert a String column into a Decimal column.
215
+ #
216
+ # This method infers the needed parameters `precision` and `scale`.
217
+ #
218
+ # @param inference_length [Integer]
219
+ # Number of elements to parse to determine the `precision` and `scale`.
220
+ #
221
+ # @return [Expr]
222
+ #
223
+ # @example
224
+ # df = Polars::DataFrame.new(
225
+ # {
226
+ # "numbers": [
227
+ # "40.12",
228
+ # "3420.13",
229
+ # "120134.19",
230
+ # "3212.98",
231
+ # "12.90",
232
+ # "143.09",
233
+ # "143.9"
234
+ # ]
235
+ # }
236
+ # )
237
+ # df.with_columns(numbers_decimal: Polars.col("numbers").str.to_decimal)
238
+ # # =>
239
+ # # shape: (7, 2)
240
+ # # ┌───────────┬─────────────────┐
241
+ # # │ numbers ┆ numbers_decimal │
242
+ # # │ --- ┆ --- │
243
+ # # │ str ┆ decimal[*,2] │
244
+ # # ╞═══════════╪═════════════════╡
245
+ # # │ 40.12 ┆ 40.12 │
246
+ # # │ 3420.13 ┆ 3420.13 │
247
+ # # │ 120134.19 ┆ 120134.19 │
248
+ # # │ 3212.98 ┆ 3212.98 │
249
+ # # │ 12.90 ┆ 12.90 │
250
+ # # │ 143.09 ┆ 143.09 │
251
+ # # │ 143.9 ┆ 143.90 │
252
+ # # └───────────┴─────────────────┘
253
def to_decimal(inference_length = 100)
  decimal_rbexpr = _rbexpr.str_to_decimal(inference_length)
  Utils.wrap_expr(decimal_rbexpr)
end
256
+
257
+ # Get length of the strings as `:u32` (as number of bytes).
258
+ #
259
+ # @return [Expr]
260
+ #
261
+ # @note
262
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
263
+ # need the length in terms of the number of characters, use `n_chars` instead.
264
+ #
265
+ # @example
266
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
267
+ # [
268
+ # Polars.col("s").str.len_bytes.alias("length"),
269
+ # Polars.col("s").str.len_chars.alias("nchars")
270
+ # ]
271
+ # )
272
+ # df
273
+ # # =>
274
+ # # shape: (4, 3)
275
+ # # ┌──────┬────────┬────────┐
276
+ # # │ s ┆ length ┆ nchars │
277
+ # # │ --- ┆ --- ┆ --- │
278
+ # # │ str ┆ u32 ┆ u32 │
279
+ # # ╞══════╪════════╪════════╡
280
+ # # │ Café ┆ 5 ┆ 4 │
281
+ # # │ null ┆ null ┆ null │
282
+ # # │ 345 ┆ 3 ┆ 3 │
283
+ # # │ 東京 ┆ 6 ┆ 2 │
284
+ # # └──────┴────────┴────────┘
285
def len_bytes
  byte_len_rbexpr = _rbexpr.str_len_bytes
  Utils.wrap_expr(byte_len_rbexpr)
end
288
+ alias_method :lengths, :len_bytes
289
+
290
+ # Get length of the strings as `:u32` (as number of chars).
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @note
295
+ # If you know that you are working with ASCII text, `lengths` will be
296
+ # equivalent, and faster (returns length in terms of the number of bytes).
297
+ #
298
+ # @example
299
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
300
+ # [
301
+ # Polars.col("s").str.len_bytes.alias("length"),
302
+ # Polars.col("s").str.len_chars.alias("nchars")
303
+ # ]
304
+ # )
305
+ # df
306
+ # # =>
307
+ # # shape: (4, 3)
308
+ # # ┌──────┬────────┬────────┐
309
+ # # │ s ┆ length ┆ nchars │
310
+ # # │ --- ┆ --- ┆ --- │
311
+ # # │ str ┆ u32 ┆ u32 │
312
+ # # ╞══════╪════════╪════════╡
313
+ # # │ Café ┆ 5 ┆ 4 │
314
+ # # │ null ┆ null ┆ null │
315
+ # # │ 345 ┆ 3 ┆ 3 │
316
+ # # │ 東京 ┆ 6 ┆ 2 │
317
+ # # └──────┴────────┴────────┘
318
def len_chars
  char_len_rbexpr = _rbexpr.str_len_chars
  Utils.wrap_expr(char_len_rbexpr)
end
321
+ alias_method :n_chars, :len_chars
322
+
323
+ # Vertically concat the values in the Series to a single string value.
324
+ #
325
+ # @param delimiter [String]
326
+ # The delimiter to insert between consecutive string values.
327
+ # @param ignore_nulls [Boolean]
328
+ # Ignore null values (default).
329
+ #
330
+ # @return [Expr]
331
+ #
332
+ # @example
333
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
334
+ # df.select(Polars.col("foo").str.concat("-"))
335
+ # # =>
336
+ # # shape: (1, 1)
337
+ # # ┌─────┐
338
+ # # │ foo │
339
+ # # │ --- │
340
+ # # │ str │
341
+ # # ╞═════╡
342
+ # # │ 1-2 │
343
+ # # └─────┘
344
+ #
345
+ # @example
346
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
347
+ # df.select(Polars.col("foo").str.concat("-", ignore_nulls: false))
348
+ # # =>
349
+ # # shape: (1, 1)
350
+ # # ┌──────┐
351
+ # # │ foo │
352
+ # # │ --- │
353
+ # # │ str │
354
+ # # ╞══════╡
355
+ # # │ null │
356
+ # # └──────┘
357
def concat(delimiter = "-", ignore_nulls: true)
  joined_rbexpr = _rbexpr.str_concat(delimiter, ignore_nulls)
  Utils.wrap_expr(joined_rbexpr)
end
360
+
361
+ # Transform to uppercase variant.
362
+ #
363
+ # @return [Expr]
364
+ #
365
+ # @example
366
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
367
+ # df.select(Polars.col("foo").str.to_uppercase)
368
+ # # =>
369
+ # # shape: (2, 1)
370
+ # # ┌─────┐
371
+ # # │ foo │
372
+ # # │ --- │
373
+ # # │ str │
374
+ # # ╞═════╡
375
+ # # │ CAT │
376
+ # # │ DOG │
377
+ # # └─────┘
378
def to_uppercase
  upper_rbexpr = _rbexpr.str_to_uppercase
  Utils.wrap_expr(upper_rbexpr)
end
381
+
382
+ # Transform to lowercase variant.
383
+ #
384
+ # @return [Expr]
385
+ #
386
+ # @example
387
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
388
+ # df.select(Polars.col("foo").str.to_lowercase)
389
+ # # =>
390
+ # # shape: (2, 1)
391
+ # # ┌─────┐
392
+ # # │ foo │
393
+ # # │ --- │
394
+ # # │ str │
395
+ # # ╞═════╡
396
+ # # │ cat │
397
+ # # │ dog │
398
+ # # └─────┘
399
def to_lowercase
  lower_rbexpr = _rbexpr.str_to_lowercase
  Utils.wrap_expr(lower_rbexpr)
end
402
+
403
+ # Transform to titlecase variant.
404
+ #
405
+ # @return [Expr]
406
+ #
407
+ # @example
408
+ # df = Polars::DataFrame.new(
409
+ # {"sing": ["welcome to my world", "THERE'S NO TURNING BACK"]}
410
+ # )
411
+ # df.with_columns(foo_title: Polars.col("sing").str.to_titlecase)
412
+ # # =>
413
+ # # shape: (2, 2)
414
+ # # ┌─────────────────────────┬─────────────────────────┐
415
+ # # │ sing ┆ foo_title │
416
+ # # │ --- ┆ --- │
417
+ # # │ str ┆ str │
418
+ # # ╞═════════════════════════╪═════════════════════════╡
419
+ # # │ welcome to my world ┆ Welcome To My World │
420
+ # # │ THERE'S NO TURNING BACK ┆ There's No Turning Back │
421
+ # # └─────────────────────────┴─────────────────────────┘
422
def to_titlecase
  # This method always raises `Todo`. The previous body also contained
  #   Utils.wrap_expr(_rbexpr.str_to_titlecase)
  # after the raise, which could never execute; it is kept here only as a
  # note of the presumably intended implementation (TODO confirm).
  raise Todo
end
426
+
427
+ # Remove leading and trailing whitespace.
428
+ #
429
+ # @param characters [String, nil]
430
+ # An optional single character that should be trimmed.
431
+ #
432
+ # @return [Expr]
433
+ #
434
+ # @example
435
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
436
+ # df.select(Polars.col("foo").str.strip)
437
+ # # =>
438
+ # # shape: (3, 1)
439
+ # # ┌───────┐
440
+ # # │ foo │
441
+ # # │ --- │
442
+ # # │ str │
443
+ # # ╞═══════╡
444
+ # # │ lead │
445
+ # # │ trail │
446
+ # # │ both │
447
+ # # └───────┘
448
def strip_chars(characters = nil)
  # Strings are treated as literals (str_as_lit) when parsed into an expression.
  chars_expr = Utils.parse_as_expression(characters, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_chars(chars_expr)
  Utils.wrap_expr(stripped_rbexpr)
end
452
+ alias_method :strip, :strip_chars
453
+
454
+ # Remove leading whitespace.
455
+ #
456
+ # @param characters [String, nil]
457
+ # An optional single character that should be trimmed.
458
+ #
459
+ # @return [Expr]
460
+ #
461
+ # @example
462
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
463
+ # df.select(Polars.col("foo").str.lstrip)
464
+ # # =>
465
+ # # shape: (3, 1)
466
+ # # ┌────────┐
467
+ # # │ foo │
468
+ # # │ --- │
469
+ # # │ str │
470
+ # # ╞════════╡
471
+ # # │ lead │
472
+ # # │ trail │
473
+ # # │ both │
474
+ # # └────────┘
475
def strip_chars_start(characters = nil)
  # Strings are treated as literals (str_as_lit) when parsed into an expression.
  chars_expr = Utils.parse_as_expression(characters, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_chars_start(chars_expr)
  Utils.wrap_expr(stripped_rbexpr)
end
479
+ alias_method :lstrip, :strip_chars_start
480
+
481
+ # Remove trailing whitespace.
482
+ #
483
+ # @param characters [String, nil]
484
+ # An optional single character that should be trimmed.
485
+ #
486
+ # @return [Expr]
487
+ #
488
+ # @example
489
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
490
+ # df.select(Polars.col("foo").str.rstrip)
491
+ # # =>
492
+ # # shape: (3, 1)
493
+ # # ┌───────┐
494
+ # # │ foo │
495
+ # # │ --- │
496
+ # # │ str │
497
+ # # ╞═══════╡
498
+ # # │ lead │
499
+ # # │ trail │
500
+ # # │ both │
501
+ # # └───────┘
502
def strip_chars_end(characters = nil)
  # Strings are treated as literals (str_as_lit) when parsed into an expression.
  chars_expr = Utils.parse_as_expression(characters, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_chars_end(chars_expr)
  Utils.wrap_expr(stripped_rbexpr)
end
506
+ alias_method :rstrip, :strip_chars_end
507
+
508
+ # Remove prefix.
509
+ #
510
+ # The prefix will be removed from the string exactly once, if found.
511
+ #
512
+ # @param prefix [String]
513
+ # The prefix to be removed.
514
+ #
515
+ # @return [Expr]
516
+ #
517
+ # @example
518
+ # df = Polars::DataFrame.new({"a" => ["foobar", "foofoobar", "foo", "bar"]})
519
+ # df.with_columns(Polars.col("a").str.strip_prefix("foo").alias("stripped"))
520
+ # # =>
521
+ # # shape: (4, 2)
522
+ # # ┌───────────┬──────────┐
523
+ # # │ a ┆ stripped │
524
+ # # │ --- ┆ --- │
525
+ # # │ str ┆ str │
526
+ # # ╞═══════════╪══════════╡
527
+ # # │ foobar ┆ bar │
528
+ # # │ foofoobar ┆ foobar │
529
+ # # │ foo ┆ │
530
+ # # │ bar ┆ bar │
531
+ # # └───────────┴──────────┘
532
def strip_prefix(prefix)
  # Strings are treated as literals (str_as_lit) when parsed into an expression.
  prefix_expr = Utils.parse_as_expression(prefix, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_prefix(prefix_expr)
  Utils.wrap_expr(stripped_rbexpr)
end
536
+
537
+ # Remove suffix.
538
+ #
539
+ # The suffix will be removed from the string exactly once, if found.
540
+ #
541
+ #
542
+ # @param suffix [String]
543
+ # The suffix to be removed.
544
+ #
545
+ # @return [Expr]
546
+ #
547
+ # @example
548
+ # df = Polars::DataFrame.new({"a" => ["foobar", "foobarbar", "foo", "bar"]})
549
+ # df.with_columns(Polars.col("a").str.strip_suffix("bar").alias("stripped"))
550
+ # # =>
551
+ # # shape: (4, 2)
552
+ # # ┌───────────┬──────────┐
553
+ # # │ a ┆ stripped │
554
+ # # │ --- ┆ --- │
555
+ # # │ str ┆ str │
556
+ # # ╞═══════════╪══════════╡
557
+ # # │ foobar ┆ foo │
558
+ # # │ foobarbar ┆ foobar │
559
+ # # │ foo ┆ foo │
560
+ # # │ bar ┆ │
561
+ # # └───────────┴──────────┘
562
def strip_suffix(suffix)
  # Strings are treated as literals (str_as_lit) when parsed into an expression.
  suffix_expr = Utils.parse_as_expression(suffix, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_suffix(suffix_expr)
  Utils.wrap_expr(stripped_rbexpr)
end
566
+
567
+ # Pad the start of the string until it reaches the given length.
568
+ #
569
+ # @param length [Integer]
570
+ # Pad the string until it reaches this length. Strings with length equal to
571
+ # or greater than this value are returned as-is.
572
+ # @param fill_char [String]
573
+ # The character to pad the string with.
574
+ #
575
+ # @return [Expr]
576
+ #
577
+ # @example
578
+ # df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
579
+ # df.with_columns(padded: Polars.col("a").str.pad_start(8, "*"))
580
+ # # =>
581
+ # # shape: (4, 2)
582
+ # # ┌──────────────┬──────────────┐
583
+ # # │ a ┆ padded │
584
+ # # │ --- ┆ --- │
585
+ # # │ str ┆ str │
586
+ # # ╞══════════════╪══════════════╡
587
+ # # │ cow ┆ *****cow │
588
+ # # │ monkey ┆ **monkey │
589
+ # # │ hippopotamus ┆ hippopotamus │
590
+ # # │ null ┆ null │
591
+ # # └──────────────┴──────────────┘
592
def pad_start(length, fill_char = " ")
  padded_rbexpr = _rbexpr.str_pad_start(length, fill_char)
  Utils.wrap_expr(padded_rbexpr)
end
595
+ alias_method :rjust, :pad_start
596
+
597
+ # Pad the end of the string until it reaches the given length.
598
+ #
599
+ # @param length [Integer]
600
+ # Pad the string until it reaches this length. Strings with length equal to
601
+ # or greater than this value are returned as-is.
602
+ # @param fill_char [String]
603
+ # The character to pad the string with.
604
+ #
605
+ # @return [Expr]
606
+ #
607
+ # @example
608
+ # df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
609
+ # df.with_columns(padded: Polars.col("a").str.pad_end(8, "*"))
610
+ # # =>
611
+ # # shape: (4, 2)
612
+ # # ┌──────────────┬──────────────┐
613
+ # # │ a ┆ padded │
614
+ # # │ --- ┆ --- │
615
+ # # │ str ┆ str │
616
+ # # ╞══════════════╪══════════════╡
617
+ # # │ cow ┆ cow***** │
618
+ # # │ monkey ┆ monkey** │
619
+ # # │ hippopotamus ┆ hippopotamus │
620
+ # # │ null ┆ null │
621
+ # # └──────────────┴──────────────┘
622
def pad_end(length, fill_char = " ")
  padded_rbexpr = _rbexpr.str_pad_end(length, fill_char)
  Utils.wrap_expr(padded_rbexpr)
end
625
+ alias_method :ljust, :pad_end
626
+
627
+ # Fills the string with zeroes.
628
+ #
629
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
630
+ # of length width.
631
+ #
632
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
633
+ # sign character rather than before. The original string is returned if width is
634
+ # less than or equal to `s.length`.
635
+ #
636
+ # @param length [Integer]
637
+ # Fill the value up to this length
638
+ #
639
+ # @return [Expr]
640
+ #
641
+ # @example
642
+ # df = Polars::DataFrame.new({"a" => [-1, 123, 999999, nil]})
643
+ # df.with_columns(Polars.col("a").cast(Polars::String).str.zfill(4).alias("zfill"))
644
+ # # =>
645
+ # # shape: (4, 2)
646
+ # # ┌────────┬────────┐
647
+ # # │ a ┆ zfill │
648
+ # # │ --- ┆ --- │
649
+ # # │ i64 ┆ str │
650
+ # # ╞════════╪════════╡
651
+ # # │ -1 ┆ -001 │
652
+ # # │ 123 ┆ 0123 │
653
+ # # │ 999999 ┆ 999999 │
654
+ # # │ null ┆ null │
655
+ # # └────────┴────────┘
656
def zfill(length)
  # `length` is parsed into an expression before being handed to the backend call.
  length_expr = Utils.parse_as_expression(length)
  filled_rbexpr = _rbexpr.str_zfill(length_expr)
  Utils.wrap_expr(filled_rbexpr)
end
660
+
661
+ # Check if string contains a substring that matches a regex.
662
+ #
663
+ # @param pattern [String]
664
+ # A valid regex pattern.
665
+ # @param literal [Boolean]
666
+ # Treat pattern as a literal string.
667
+ #
668
+ # @return [Expr]
669
+ #
670
+ # @example
671
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
672
+ # df.select(
673
+ # [
674
+ # Polars.col("a"),
675
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
676
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
677
+ # ]
678
+ # )
679
+ # # =>
680
+ # # shape: (4, 3)
681
+ # # ┌─────────────┬───────┬─────────┐
682
+ # # │ a ┆ regex ┆ literal │
683
+ # # │ --- ┆ --- ┆ --- │
684
+ # # │ str ┆ bool ┆ bool │
685
+ # # ╞═════════════╪═══════╪═════════╡
686
+ # # │ Crab ┆ false ┆ false │
687
+ # # │ cat and dog ┆ true ┆ false │
688
+ # # │ rab$bit ┆ true ┆ true │
689
+ # # │ null ┆ null ┆ null │
690
+ # # └─────────────┴───────┴─────────┘
691
def contains(pattern, literal: false, strict: true)
  # Plain strings become literal expressions (str_to_lit) before the lookup is built.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  contains_rbexpr = _rbexpr.str_contains(pattern_expr._rbexpr, literal, strict)
  Utils.wrap_expr(contains_rbexpr)
end
695
+
696
+ # Check if string values end with a substring.
697
+ #
698
+ # @param sub [String]
699
+ # Suffix substring.
700
+ #
701
+ # @return [Expr]
702
+ #
703
+ # @example
704
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
705
+ # df.with_column(
706
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
707
+ # )
708
+ # # =>
709
+ # # shape: (3, 2)
710
+ # # ┌────────┬────────────┐
711
+ # # │ fruits ┆ has_suffix │
712
+ # # │ --- ┆ --- │
713
+ # # │ str ┆ bool │
714
+ # # ╞════════╪════════════╡
715
+ # # │ apple ┆ false │
716
+ # # │ mango ┆ true │
717
+ # # │ null ┆ null │
718
+ # # └────────┴────────────┘
719
+ #
720
+ # @example Using `ends_with` as a filter condition:
721
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
722
+ # # =>
723
+ # # shape: (1, 1)
724
+ # # ┌────────┐
725
+ # # │ fruits │
726
+ # # │ --- │
727
+ # # │ str │
728
+ # # ╞════════╡
729
+ # # │ mango │
730
+ # # └────────┘
731
def ends_with(sub)
  # Plain strings become literal expressions (str_to_lit) before the check is built.
  suffix_expr = Utils.expr_to_lit_or_expr(sub, str_to_lit: true)
  ends_rbexpr = _rbexpr.str_ends_with(suffix_expr._rbexpr)
  Utils.wrap_expr(ends_rbexpr)
end
735
+
736
+ # Check if string values start with a substring.
737
+ #
738
+ # @param sub [String]
739
+ # Prefix substring.
740
+ #
741
+ # @return [Expr]
742
+ #
743
+ # @example
744
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
745
+ # df.with_column(
746
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
747
+ # )
748
+ # # =>
749
+ # # shape: (3, 2)
750
+ # # ┌────────┬────────────┐
751
+ # # │ fruits ┆ has_prefix │
752
+ # # │ --- ┆ --- │
753
+ # # │ str ┆ bool │
754
+ # # ╞════════╪════════════╡
755
+ # # │ apple ┆ true │
756
+ # # │ mango ┆ false │
757
+ # # │ null ┆ null │
758
+ # # └────────┴────────────┘
759
+ #
760
+ # @example Using `starts_with` as a filter condition:
761
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
762
+ # # =>
763
+ # # shape: (1, 1)
764
+ # # ┌────────┐
765
+ # # │ fruits │
766
+ # # │ --- │
767
+ # # │ str │
768
+ # # ╞════════╡
769
+ # # │ apple │
770
+ # # └────────┘
771
def starts_with(sub)
  # Plain strings become literal expressions (str_to_lit) before the check is built.
  prefix_expr = Utils.expr_to_lit_or_expr(sub, str_to_lit: true)
  starts_rbexpr = _rbexpr.str_starts_with(prefix_expr._rbexpr)
  Utils.wrap_expr(starts_rbexpr)
end
775
+
776
+ # Parse string values as JSON.
777
+ #
778
+ # Raises an error if an invalid JSON string is encountered.
779
+ #
780
+ # @param dtype [Object]
781
+ # The dtype to cast the extracted value to. If nil, the dtype will be
782
+ # inferred from the JSON value.
783
+ #
784
+ # @return [Expr]
785
+ #
786
+ # @example
787
+ # df = Polars::DataFrame.new(
788
+ # {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
789
+ # )
790
+ # dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
791
+ # df.select(Polars.col("json").str.json_decode(dtype))
792
+ # # =>
793
+ # # shape: (3, 1)
794
+ # # ┌─────────────┐
795
+ # # │ json │
796
+ # # │ --- │
797
+ # # │ struct[2] │
798
+ # # ╞═════════════╡
799
+ # # │ {1,true} │
800
+ # # │ {null,null} │
801
+ # # │ {2,false} │
802
+ # # └─────────────┘
803
def json_decode(dtype = nil, infer_schema_length: 100)
  # A nil dtype is passed through unchanged; any other value is converted with
  # `Utils.rb_type_to_dtype` first.
  target_dtype = dtype.nil? ? dtype : Utils.rb_type_to_dtype(dtype)
  decoded_rbexpr = _rbexpr.str_json_decode(target_dtype, infer_schema_length)
  Utils.wrap_expr(decoded_rbexpr)
end
809
+ alias_method :json_extract, :json_decode
810
+
811
+ # Extract the first match of json string with provided JSONPath expression.
812
+ #
813
+ # Raises an error if an invalid JSON string is encountered.
814
+ # All returned values are cast to Utf8 regardless of the original value.
815
+ #
816
+ # Documentation on JSONPath standard can be found
817
+ # [here](https://goessner.net/articles/JsonPath/).
818
+ #
819
+ # @param json_path [String]
820
+ # A valid JSON path query string.
821
+ #
822
+ # @return [Expr]
823
+ #
824
+ # @example
825
+ # df = Polars::DataFrame.new(
826
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
827
+ # )
828
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
829
+ # # =>
830
+ # # shape: (5, 1)
831
+ # # ┌──────────┐
832
+ # # │ json_val │
833
+ # # │ --- │
834
+ # # │ str │
835
+ # # ╞══════════╡
836
+ # # │ 1 │
837
+ # # │ null │
838
+ # # │ 2 │
839
+ # # │ 2.1 │
840
+ # # │ true │
841
+ # # └──────────┘
842
def json_path_match(json_path)
  matched_rbexpr = _rbexpr.str_json_path_match(json_path)
  Utils.wrap_expr(matched_rbexpr)
end
845
+
846
+ # Decode a value using the provided encoding.
847
+ #
848
+ # @param encoding ["hex", "base64"]
849
+ # The encoding to use.
850
+ # @param strict [Boolean]
851
+ # How to handle invalid inputs:
852
+ #
853
+ # - `true`: An error will be thrown if unable to decode a value.
854
+ # - `false`: Unhandled values will be replaced with `nil`.
855
+ #
856
+ # @return [Expr]
857
+ #
858
+ # @example
859
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
860
+ # df.select(Polars.col("encoded").str.decode("hex"))
861
+ # # =>
862
+ # # shape: (3, 1)
863
+ # # ┌─────────┐
864
+ # # │ encoded │
865
+ # # │ --- │
866
+ # # │ binary │
867
+ # # ╞═════════╡
868
+ # # │ b"foo" │
869
+ # # │ b"bar" │
870
+ # # │ null │
871
+ # # └─────────┘
872
def decode(encoding, strict: true)
  # Dispatch on the encoding name; only "hex" and "base64" are handled.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # Ruby does not treat `{{` as an escape, so single braces are used here;
    # `inspect` keeps nil and symbol arguments visible in the message.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
881
+
882
+ # Encode a value using the provided encoding.
883
+ #
884
+ # @param encoding ["hex", "base64"]
885
+ # The encoding to use.
886
+ #
887
+ # @return [Expr]
888
+ #
889
+ # @example
890
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
891
+ # df.select(Polars.col("strings").str.encode("hex"))
892
+ # # =>
893
+ # # shape: (3, 1)
894
+ # # ┌─────────┐
895
+ # # │ strings │
896
+ # # │ --- │
897
+ # # │ str │
898
+ # # ╞═════════╡
899
+ # # │ 666f6f │
900
+ # # │ 626172 │
901
+ # # │ null │
902
+ # # └─────────┘
903
def encode(encoding)
  # Dispatch on the encoding name; only "hex" and "base64" are handled.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # Ruby does not treat `{{` as an escape, so single braces are used here;
    # `inspect` keeps nil and symbol arguments visible in the message.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
912
+
913
+ # Extract the target capture group from provided patterns.
914
+ #
915
+ # @param pattern [String]
916
+ # A valid regex pattern
917
+ # @param group_index [Integer]
918
+ # Index of the targeted capture group.
919
+ # Group 0 means the whole pattern; the first capture group begins at index 1.
920
+ # Defaults to the first capture group.
921
+ #
922
+ # @return [Expr]
923
+ #
924
+ # @example
925
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
926
+ # df.select(
927
+ # [
928
+ # Polars.col("foo").str.extract('(\d+)')
929
+ # ]
930
+ # )
931
+ # # =>
932
+ # # shape: (2, 1)
933
+ # # ┌─────┐
934
+ # # │ foo │
935
+ # # │ --- │
936
+ # # │ str │
937
+ # # ╞═════╡
938
+ # # │ 123 │
939
+ # # │ 678 │
940
+ # # └─────┘
941
def extract(pattern, group_index: 1)
  # NOTE(review): `str_as_lit: true` presumably makes a plain String a literal
  # rather than a column reference - confirm in Utils.parse_as_expression.
  pattern_expr = Utils.parse_as_expression(pattern, str_as_lit: true)
  extracted = _rbexpr.str_extract(pattern_expr, group_index)
  Utils.wrap_expr(extracted)
end
945
+
946
+ # Extracts all matches for the given regex pattern.
947
+ #
948
+ # Extracts each successive non-overlapping regex match in an individual string as
949
+ # an array.
950
+ #
951
+ # @param pattern [String]
952
+ # A valid regex pattern
953
+ #
954
+ # @return [Expr]
955
+ #
956
+ # @example
957
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
958
+ # df.select(
959
+ # [
960
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
961
+ # ]
962
+ # )
963
+ # # =>
964
+ # # shape: (2, 1)
965
+ # # ┌────────────────┐
966
+ # # │ extracted_nrs │
967
+ # # │ --- │
968
+ # # │ list[str] │
969
+ # # ╞════════════════╡
970
+ # # │ ["123", "45"] │
971
+ # # │ ["678", "910"] │
972
+ # # └────────────────┘
973
def extract_all(pattern)
  # Convert the argument to an expression first, then hand its underlying
  # native expression to the native call.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  matches = _rbexpr.str_extract_all(pattern_expr._rbexpr)
  Utils.wrap_expr(matches)
end
977
+
978
+ # Extract all capture groups for the given regex pattern.
979
+ #
980
+ # @param pattern [String]
981
+ # A valid regular expression pattern containing at least one capture group,
982
+ # compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
983
+ #
984
+ # @return [Expr]
985
+ #
986
+ # @example
987
+ # df = Polars::DataFrame.new(
988
+ # {
989
+ # "url": [
990
+ # "http://vote.com/ballon_dor?candidate=messi&ref=python",
991
+ # "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
992
+ # "http://vote.com/ballon_dor?error=404&ref=rust"
993
+ # ]
994
+ # }
995
+ # )
996
+ # pattern = /candidate=(?<candidate>\w+)&ref=(?<ref>\w+)/.to_s
997
+ # df.select(captures: Polars.col("url").str.extract_groups(pattern)).unnest(
998
+ # "captures"
999
+ # )
1000
+ # # =>
1001
+ # # shape: (3, 2)
1002
+ # # ┌───────────┬────────┐
1003
+ # # │ candidate ┆ ref │
1004
+ # # │ --- ┆ --- │
1005
+ # # │ str ┆ str │
1006
+ # # ╞═══════════╪════════╡
1007
+ # # │ messi ┆ python │
1008
+ # # │ weghorst ┆ polars │
1009
+ # # │ null ┆ null │
1010
+ # # └───────────┴────────┘
1011
+ #
1012
+ # @example Unnamed groups have their numerical position converted to a string:
1013
+ # pattern = /candidate=(\w+)&ref=(\w+)/.to_s
1014
+ # (
1015
+ # df.with_columns(
1016
+ # captures: Polars.col("url").str.extract_groups(pattern)
1017
+ # ).with_columns(name: Polars.col("captures").struct["1"].str.to_uppercase)
1018
+ # )
1019
+ # # =>
1020
+ # # shape: (3, 3)
1021
+ # # ┌───────────────────────────────────┬───────────────────────┬──────────┐
1022
+ # # │ url ┆ captures ┆ name │
1023
+ # # │ --- ┆ --- ┆ --- │
1024
+ # # │ str ┆ struct[2] ┆ str │
1025
+ # # ╞═══════════════════════════════════╪═══════════════════════╪══════════╡
1026
+ # # │ http://vote.com/ballon_dor?candi… ┆ {"messi","python"} ┆ MESSI │
1027
+ # # │ http://vote.com/ballon_dor?candi… ┆ {"weghorst","polars"} ┆ WEGHORST │
1028
+ # # │ http://vote.com/ballon_dor?error… ┆ {null,null} ┆ null │
1029
+ # # └───────────────────────────────────┴───────────────────────┴──────────┘
1030
def extract_groups(pattern)
  # The pattern is passed to the native call as given; no expression parsing
  # happens in this method.
  groups = _rbexpr.str_extract_groups(pattern)
  Utils.wrap_expr(groups)
end
1033
+
1034
+ # Count all successive non-overlapping regex matches.
1035
+ #
1036
+ # @param pattern [String]
1037
+ # A valid regex pattern
1038
+ #
1039
+ # @return [Expr]
1040
+ #
1041
+ # @example
1042
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
1043
+ # df.select(
1044
+ # [
1045
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
1046
+ # ]
1047
+ # )
1048
+ # # =>
1049
+ # # shape: (2, 1)
1050
+ # # ┌──────────────┐
1051
+ # # │ count_digits │
1052
+ # # │ --- │
1053
+ # # │ u32 │
1054
+ # # ╞══════════════╡
1055
+ # # │ 5 │
1056
+ # # │ 6 │
1057
+ # # └──────────────┘
1058
def count_matches(pattern, literal: false)
  # `literal` is forwarded unchanged to the native call.
  pattern_expr = Utils.parse_as_expression(pattern, str_as_lit: true)
  counts = _rbexpr.str_count_matches(pattern_expr, literal)
  Utils.wrap_expr(counts)
end
# `count_match` stays available as an alternate name for `count_matches`.
alias count_match count_matches
1063
+
1064
+ # Split the string by a substring.
1065
+ #
1066
+ # @param by [String]
1067
+ # Substring to split by.
1068
+ # @param inclusive [Boolean]
1069
+ # If true, include the split character/string in the results.
1070
+ #
1071
+ # @return [Expr]
1072
+ #
1073
+ # @example
1074
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
1075
+ # df.select(Polars.col("s").str.split(" "))
1076
+ # # =>
1077
+ # # shape: (3, 1)
1078
+ # # ┌───────────────────────┐
1079
+ # # │ s │
1080
+ # # │ --- │
1081
+ # # │ list[str] │
1082
+ # # ╞═══════════════════════╡
1083
+ # # │ ["foo", "bar"] │
1084
+ # # │ ["foo-bar"] │
1085
+ # # │ ["foo", "bar", "baz"] │
1086
+ # # └───────────────────────┘
1087
def split(by, inclusive: false)
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  # `inclusive` selects between the two native split variants.
  pieces = inclusive ? _rbexpr.str_split_inclusive(separator) : _rbexpr.str_split(separator)
  Utils.wrap_expr(pieces)
end
1095
+
1096
+ # Split the string by a substring using `n` splits.
1097
+ #
1098
+ # Results in a struct of `n+1` fields.
1099
+ #
1100
+ # If it cannot make `n` splits, the remaining field elements will be null.
1101
+ #
1102
+ # @param by [String]
1103
+ # Substring to split by.
1104
+ # @param n [Integer]
1105
+ # Number of splits to make.
1106
+ # @param inclusive [Boolean]
1107
+ # If true, include the split character/string in the results.
1108
+ #
1109
+ # @return [Expr]
1110
+ #
1111
+ # @example
1112
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
1113
+ # df.select(
1114
+ # [
1115
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
1116
+ # ]
1117
+ # )
1118
+ # # =>
1119
+ # # shape: (4, 1)
1120
+ # # ┌─────────────┐
1121
+ # # │ fields │
1122
+ # # │ --- │
1123
+ # # │ struct[2] │
1124
+ # # ╞═════════════╡
1125
+ # # │ {"a","1"} │
1126
+ # # │ {null,null} │
1127
+ # # │ {"c",null} │
1128
+ # # │ {"d","4"} │
1129
+ # # └─────────────┘
1130
def split_exact(by, n, inclusive: false)
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  # `inclusive` selects between the two native split_exact variants; `n` is
  # forwarded unchanged to whichever one is used.
  fields =
    if inclusive
      _rbexpr.str_split_exact_inclusive(separator, n)
    else
      _rbexpr.str_split_exact(separator, n)
    end
  Utils.wrap_expr(fields)
end
1138
+
1139
+ # Split the string by a substring, restricted to returning at most `n` items.
1140
+ #
1141
+ # If the number of possible splits is less than `n-1`, the remaining field
1142
+ # elements will be null. If the number of possible splits is `n-1` or greater,
1143
+ # the last (nth) substring will contain the remainder of the string.
1144
+ #
1145
+ # @param by [String]
1146
+ # Substring to split by.
1147
+ # @param n [Integer]
1148
+ # Max number of items to return.
1149
+ #
1150
+ # @return [Expr]
1151
+ #
1152
+ # @example
1153
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
1154
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
1155
+ # # =>
1156
+ # # shape: (4, 1)
1157
+ # # ┌───────────────────┐
1158
+ # # │ fields │
1159
+ # # │ --- │
1160
+ # # │ struct[2] │
1161
+ # # ╞═══════════════════╡
1162
+ # # │ {"foo","bar"} │
1163
+ # # │ {null,null} │
1164
+ # # │ {"foo-bar",null} │
1165
+ # # │ {"foo","bar baz"} │
1166
+ # # └───────────────────┘
1167
def splitn(by, n)
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  limited = _rbexpr.str_splitn(separator, n)
  Utils.wrap_expr(limited)
end
1171
+
1172
+ # Replace first matching regex/literal substring with a new string value.
1173
+ #
1174
+ # @param pattern [String]
1175
+ # Regex pattern.
1176
+ # @param value [String]
1177
+ # Replacement string.
1178
+ # @param literal [Boolean]
1179
+ # Treat pattern as a literal string.
1180
+ #
1181
+ # @return [Expr]
1182
+ #
1183
+ # @example
1184
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
1185
+ # df.with_column(
1186
+ # Polars.col("text").str.replace('abc\b', "ABC")
1187
+ # )
1188
+ # # =>
1189
+ # # shape: (2, 2)
1190
+ # # ┌─────┬────────┐
1191
+ # # │ id ┆ text │
1192
+ # # │ --- ┆ --- │
1193
+ # # │ i64 ┆ str │
1194
+ # # ╞═════╪════════╡
1195
+ # # │ 1 ┆ 123ABC │
1196
+ # # │ 2 ┆ abc456 │
1197
+ # # └─────┴────────┘
1198
def replace(pattern, value, literal: false, n: 1)
  # Both arguments go through the same conversion, and the native call
  # receives their underlying native expressions.
  # NOTE(review): `n` is forwarded as-is; presumably the number of matches to
  # replace - confirm against the native `str_replace_n`.
  pattern_rb, value_rb = [pattern, value].map do |arg|
    Utils.expr_to_lit_or_expr(arg, str_to_lit: true)._rbexpr
  end
  Utils.wrap_expr(_rbexpr.str_replace_n(pattern_rb, value_rb, literal, n))
end
1203
+
1204
+ # Replace all matching regex/literal substrings with a new string value.
1205
+ #
1206
+ # @param pattern [String]
1207
+ # Regex pattern.
1208
+ # @param value [String]
1209
+ # Replacement string.
1210
+ # @param literal [Boolean]
1211
+ # Treat pattern as a literal string.
1212
+ #
1213
+ # @return [Expr]
1214
+ #
1215
+ # @example
1216
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
1217
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
1218
+ # # =>
1219
+ # # shape: (2, 2)
1220
+ # # ┌─────┬─────────┐
1221
+ # # │ id ┆ text │
1222
+ # # │ --- ┆ --- │
1223
+ # # │ i64 ┆ str │
1224
+ # # ╞═════╪═════════╡
1225
+ # # │ 1 ┆ -bc-bc │
1226
+ # # │ 2 ┆ 123-123 │
1227
+ # # └─────┴─────────┘
1228
def replace_all(pattern, value, literal: false)
  # Convert both arguments to expressions, then pass their native
  # counterparts to the native replace-all call.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
1233
+
1234
+ # Returns string values in reversed order.
1235
+ #
1236
+ # @return [Expr]
1237
+ #
1238
+ # @example
1239
+ # df = Polars::DataFrame.new({"text" => ["foo", "bar", "man\u0303ana"]})
1240
+ # df.with_columns(Polars.col("text").str.reverse.alias("reversed"))
1241
+ # # =>
1242
+ # # shape: (3, 2)
1243
+ # # ┌────────┬──────────┐
1244
+ # # │ text ┆ reversed │
1245
+ # # │ --- ┆ --- │
1246
+ # # │ str ┆ str │
1247
+ # # ╞════════╪══════════╡
1248
+ # # │ foo ┆ oof │
1249
+ # # │ bar ┆ rab │
1250
+ # # │ mañana ┆ anañam │
1251
+ # # └────────┴──────────┘
1252
def reverse
  # Thin wrapper: delegate to the native call and wrap its result.
  reversed = _rbexpr.str_reverse
  Utils.wrap_expr(reversed)
end
1255
+
1256
+ # Create subslices of the string values of a Utf8 Series.
1257
+ #
1258
+ # @param offset [Integer]
1259
+ # Start index. Negative indexing is supported.
1260
+ # @param length [Integer]
1261
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
1262
+ # end of the string.
1263
+ #
1264
+ # @return [Expr]
1265
+ #
1266
+ # @example
1267
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
1268
+ # df.with_column(
1269
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
1270
+ # )
1271
+ # # =>
1272
+ # # shape: (4, 2)
1273
+ # # ┌─────────────┬──────────┐
1274
+ # # │ s ┆ s_sliced │
1275
+ # # │ --- ┆ --- │
1276
+ # # │ str ┆ str │
1277
+ # # ╞═════════════╪══════════╡
1278
+ # # │ pear ┆ ear │
1279
+ # # │ null ┆ null │
1280
+ # # │ papaya ┆ aya │
1281
+ # # │ dragonfruit ┆ uit │
1282
+ # # └─────────────┴──────────┘
1283
def slice(offset, length = nil)
  # Both bounds go through the same conversion; a nil length is passed
  # through it as well.
  offset_expr, length_expr = [offset, length].map { |bound| Utils.parse_as_expression(bound) }
  Utils.wrap_expr(_rbexpr.str_slice(offset_expr, length_expr))
end
1288
+
1289
+ # Returns a column with a separate row for every string character.
1290
+ #
1291
+ # @return [Expr]
1292
+ #
1293
+ # @example
1294
+ # df = Polars::DataFrame.new({"a": ["foo", "bar"]})
1295
+ # df.select(Polars.col("a").str.explode)
1296
+ # # =>
1297
+ # # shape: (6, 1)
1298
+ # # ┌─────┐
1299
+ # # │ a │
1300
+ # # │ --- │
1301
+ # # │ str │
1302
+ # # ╞═════╡
1303
+ # # │ f │
1304
+ # # │ o │
1305
+ # # │ o │
1306
+ # # │ b │
1307
+ # # │ a │
1308
+ # # │ r │
1309
+ # # └─────┘
1310
def explode
  # Thin wrapper: delegate to the native call and wrap its result.
  exploded = _rbexpr.str_explode
  Utils.wrap_expr(exploded)
end
1313
+
1314
+ # Convert a Utf8 column into an Int64 column, parsing with the given base (radix).
1315
+ #
1316
+ # @param base [Integer]
1317
+ # Positive integer which is the base of the string we are parsing.
1318
+ # Default: 10.
1319
+ # @param strict [Boolean]
1320
+ # If `true` (default), any parse error or overflow raises a ComputeError.
1321
+ # If `false`, such values are silently converted to null.
1322
+ #
1323
+ # @return [Expr]
1324
+ #
1325
+ # @example
1326
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1327
+ # df.with_columns(Polars.col("bin").str.to_integer(base: 2, strict: false).alias("parsed"))
1328
+ # # =>
1329
+ # # shape: (4, 2)
1330
+ # # ┌─────────┬────────┐
1331
+ # # │ bin ┆ parsed │
1332
+ # # │ --- ┆ --- │
1333
+ # # │ str ┆ i64 │
1334
+ # # ╞═════════╪════════╡
1335
+ # # │ 110 ┆ 6 │
1336
+ # # │ 101 ┆ 5 │
1337
+ # # │ 010 ┆ 2 │
1338
+ # # │ invalid ┆ null │
1339
+ # # └─────────┴────────┘
1340
+ #
1341
+ # @example
1342
+ # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1343
+ # df.with_columns(Polars.col("hex").str.to_integer(base: 16, strict: true).alias("parsed"))
1344
+ # # =>
1345
+ # # shape: (4, 2)
1346
+ # # ┌──────┬────────┐
1347
+ # # │ hex ┆ parsed │
1348
+ # # │ --- ┆ --- │
1349
+ # # │ str ┆ i64 │
1350
+ # # ╞══════╪════════╡
1351
+ # # │ fa1e ┆ 64030 │
1352
+ # # │ ff00 ┆ 65280 │
1353
+ # # │ cafe ┆ 51966 │
1354
+ # # │ null ┆ null │
1355
+ # # └──────┴────────┘
1356
def to_integer(base: 10, strict: true)
  # The base goes through expression parsing with `str_as_lit: false`.
  base_expr = Utils.parse_as_expression(base, str_as_lit: false)
  parsed = _rbexpr.str_to_integer(base_expr, strict)
  Utils.wrap_expr(parsed)
end
1360
+
1361
+ # Parse integers with base radix from strings.
1362
+ #
1363
+ # By default base 2. With `strict: false`, parse errors and overflows become null.
1364
+ #
1365
+ # @param radix [Integer]
1366
+ # Positive integer which is the base of the string we are parsing.
1367
+ # Default: 2.
1368
+ # @param strict [Boolean]
1369
+ # If `true` (default), any parse error or overflow raises a ComputeError.
1370
+ # If `false`, such values are silently converted to null.
1371
+ #
1372
+ # @return [Expr]
1373
+ #
1374
+ # @example
1375
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1376
+ # df.select(Polars.col("bin").str.parse_int(2, strict: false))
1377
+ # # =>
1378
+ # # shape: (4, 1)
1379
+ # # ┌──────┐
1380
+ # # │ bin │
1381
+ # # │ --- │
1382
+ # # │ i32 │
1383
+ # # ╞══════╡
1384
+ # # │ 6 │
1385
+ # # │ 5 │
1386
+ # # │ 2 │
1387
+ # # │ null │
1388
+ # # └──────┘
1389
def parse_int(radix = 2, strict: true)
  # Forward the caller's radix as the parsing base (it defaults to 2), then
  # cast the result to Int32 with the same strictness.
  to_integer(base: radix, strict: strict).cast(Int32, strict: strict)
end
1392
+
1393
+ # Use the aho-corasick algorithm to find matches.
1394
+ #
1395
+ # This version determines if any of the patterns find a match.
1396
+ #
1397
+ # @param patterns [String]
1398
+ # String patterns to search.
1399
+ # @param ascii_case_insensitive [Boolean]
1400
+ # Enable ASCII-aware case insensitive matching.
1401
+ # When this option is enabled, searching will be performed without respect
1402
+ # to case for ASCII letters (a-z and A-Z) only.
1403
+ #
1404
+ # @return [Expr]
1405
+ #
1406
+ # @example
1407
+ # df = Polars::DataFrame.new(
1408
+ # {
1409
+ # "lyrics": [
1410
+ # "Everybody wants to rule the world",
1411
+ # "Tell me what you want, what you really really want",
1412
+ # "Can you feel the love tonight"
1413
+ # ]
1414
+ # }
1415
+ # )
1416
+ # df.with_columns(
1417
+ # Polars.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
1418
+ # )
1419
+ # # =>
1420
+ # # shape: (3, 2)
1421
+ # # ┌───────────────────────────────────┬──────────────┐
1422
+ # # │ lyrics ┆ contains_any │
1423
+ # # │ --- ┆ --- │
1424
+ # # │ str ┆ bool │
1425
+ # # ╞═══════════════════════════════════╪══════════════╡
1426
+ # # │ Everybody wants to rule the worl… ┆ false │
1427
+ # # │ Tell me what you want, what you … ┆ true │
1428
+ # # │ Can you feel the love tonight ┆ true │
1429
+ # # └───────────────────────────────────┴──────────────┘
1430
def contains_any(patterns, ascii_case_insensitive: false)
  # Patterns are parsed with both `str_as_lit` and `list_as_lit` disabled.
  patterns_expr = Utils.parse_as_expression(patterns, str_as_lit: false, list_as_lit: false)
  found = _rbexpr.str_contains_any(patterns_expr, ascii_case_insensitive)
  Utils.wrap_expr(found)
end
1436
+
1437
+ # Use the aho-corasick algorithm to replace many matches.
1438
+ #
1439
+ # @param patterns [String]
1440
+ # String patterns to search and replace.
1441
+ # @param replace_with [String]
1442
+ # Strings to replace where a pattern was a match.
1443
+ # This can be broadcasted. So it supports many:one and many:many.
1444
+ # @param ascii_case_insensitive [Boolean]
1445
+ # Enable ASCII-aware case insensitive matching.
1446
+ # When this option is enabled, searching will be performed without respect
1447
+ # to case for ASCII letters (a-z and A-Z) only.
1448
+ #
1449
+ # @return [Expr]
1450
+ #
1451
+ # @example
1452
+ # df = Polars::DataFrame.new(
1453
+ # {
1454
+ # "lyrics": [
1455
+ # "Everybody wants to rule the world",
1456
+ # "Tell me what you want, what you really really want",
1457
+ # "Can you feel the love tonight"
1458
+ # ]
1459
+ # }
1460
+ # )
1461
+ # df.with_columns(
1462
+ # Polars.col("lyrics")
1463
+ # .str.replace_many(
1464
+ # ["me", "you", "they"],
1465
+ # ""
1466
+ # )
1467
+ # .alias("removes_pronouns")
1468
+ # )
1469
+ # # =>
1470
+ # # shape: (3, 2)
1471
+ # # ┌───────────────────────────────────┬───────────────────────────────────┐
1472
+ # # │ lyrics ┆ removes_pronouns │
1473
+ # # │ --- ┆ --- │
1474
+ # # │ str ┆ str │
1475
+ # # ╞═══════════════════════════════════╪═══════════════════════════════════╡
1476
+ # # │ Everybody wants to rule the worl… ┆ Everybody wants to rule the worl… │
1477
+ # # │ Tell me what you want, what you … ┆ Tell what want, what really r… │
1478
+ # # │ Can you feel the love tonight ┆ Can feel the love tonight │
1479
+ # # └───────────────────────────────────┴───────────────────────────────────┘
1480
+ #
1481
+ # @example
1482
+ # df.with_columns(
1483
+ # Polars.col("lyrics")
1484
+ # .str.replace_many(
1485
+ # ["me", "you"],
1486
+ # ["you", "me"]
1487
+ # )
1488
+ # .alias("confusing")
1489
+ # )
1490
+ # # =>
1491
+ # # shape: (3, 2)
1492
+ # # ┌───────────────────────────────────┬───────────────────────────────────┐
1493
+ # # │ lyrics ┆ confusing │
1494
+ # # │ --- ┆ --- │
1495
+ # # │ str ┆ str │
1496
+ # # ╞═══════════════════════════════════╪═══════════════════════════════════╡
1497
+ # # │ Everybody wants to rule the worl… ┆ Everybody wants to rule the worl… │
1498
+ # # │ Tell me what you want, what you … ┆ Tell you what me want, what me r… │
1499
+ # # │ Can you feel the love tonight ┆ Can me feel the love tonight │
1500
+ # # └───────────────────────────────────┴───────────────────────────────────┘
1501
def replace_many(patterns, replace_with, ascii_case_insensitive: false)
  # The two arguments are parsed differently: `str_as_lit` is false for the
  # patterns and true for the replacements; `list_as_lit` is false for both.
  patterns_expr = Utils.parse_as_expression(patterns, str_as_lit: false, list_as_lit: false)
  replacement_expr = Utils.parse_as_expression(replace_with, str_as_lit: true, list_as_lit: false)
  replaced = _rbexpr.str_replace_many(patterns_expr, replacement_expr, ascii_case_insensitive)
  Utils.wrap_expr(replaced)
end
1512
+
1513
+ private
1514
+
1515
def _validate_format_argument(format)
  # TODO: validation is not implemented yet; this is a no-op that returns nil.
  nil
end
1518
+ end
1519
+ end