polars-df 0.10.0-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +175 -0
  4. data/Cargo.lock +2536 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +38726 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +98 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +72 -0
  18. data/lib/polars/cat_name_space.rb +125 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +93 -0
  21. data/lib/polars/data_frame.rb +5418 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1444 -0
  24. data/lib/polars/date_time_name_space.rb +1484 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +31 -0
  27. data/lib/polars/expr.rb +6105 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +248 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1280 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +103 -0
  39. data/lib/polars/functions/range/int_range.rb +51 -0
  40. data/lib/polars/functions/repeat.rb +144 -0
  41. data/lib/polars/functions/whenthen.rb +96 -0
  42. data/lib/polars/functions.rb +57 -0
  43. data/lib/polars/group_by.rb +548 -0
  44. data/lib/polars/io.rb +890 -0
  45. data/lib/polars/lazy_frame.rb +2833 -0
  46. data/lib/polars/lazy_group_by.rb +84 -0
  47. data/lib/polars/list_expr.rb +791 -0
  48. data/lib/polars/list_name_space.rb +445 -0
  49. data/lib/polars/meta_expr.rb +222 -0
  50. data/lib/polars/name_expr.rb +198 -0
  51. data/lib/polars/plot.rb +109 -0
  52. data/lib/polars/rolling_group_by.rb +37 -0
  53. data/lib/polars/series.rb +4527 -0
  54. data/lib/polars/slice.rb +104 -0
  55. data/lib/polars/sql_context.rb +194 -0
  56. data/lib/polars/string_cache.rb +75 -0
  57. data/lib/polars/string_expr.rb +1519 -0
  58. data/lib/polars/string_name_space.rb +810 -0
  59. data/lib/polars/struct_expr.rb +98 -0
  60. data/lib/polars/struct_name_space.rb +96 -0
  61. data/lib/polars/testing.rb +507 -0
  62. data/lib/polars/utils.rb +422 -0
  63. data/lib/polars/version.rb +4 -0
  64. data/lib/polars/whenthen.rb +83 -0
  65. data/lib/polars-df.rb +1 -0
  66. data/lib/polars.rb +72 -0
  67. metadata +125 -0
@@ -0,0 +1,1519 @@
1
+ module Polars
2
+ # Namespace for string related expressions.
3
+ class StringExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
+ def initialize(expr)
9
+ self._rbexpr = expr._rbexpr
10
+ end
11
+
12
+ # Convert a Utf8 column into a Date column.
13
+ #
14
+ # @param format [String]
15
+ # Format to use for conversion. Refer to the
16
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
17
+ # for the full specification. Example: `"%Y-%m-%d"`.
18
+ # If set to nil (default), the format is inferred from the data.
19
+ # @param strict [Boolean]
20
+ # Raise an error if any conversion fails.
21
+ # @param exact [Boolean]
22
+ # Require an exact format match. If false, allow the format to match anywhere
23
+ # in the target string.
24
+ # @param cache [Boolean]
25
+ # Use a cache of unique, converted dates to apply the conversion.
26
+ #
27
+ # @return [Expr]
28
+ #
29
+ # @example
30
+ # s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
31
+ # s.str.to_date
32
+ # # =>
33
+ # # shape: (3,)
34
+ # # Series: '' [date]
35
+ # # [
36
+ # # 2020-01-01
37
+ # # 2020-02-01
38
+ # # 2020-03-01
39
+ # # ]
40
+ def to_date(format = nil, strict: true, exact: true, cache: true)
41
+ _validate_format_argument(format)
42
+ Utils.wrap_expr(self._rbexpr.str_to_date(format, strict, exact, cache))
43
+ end
44
+
45
+ # Convert a Utf8 column into a Datetime column.
46
+ #
47
+ # @param format [String]
48
+ # Format to use for conversion. Refer to the
49
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
50
+ # for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
51
+ # If set to nil (default), the format is inferred from the data.
52
+ # @param time_unit ["us", "ns", "ms"]
53
+ # Unit of time for the resulting Datetime column. If set to nil (default),
54
+ # the time unit is inferred from the format string if given, eg:
55
+ # `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
56
+ # found, the default is `"us"`.
57
+ # @param time_zone [String]
58
+ # Time zone for the resulting Datetime column.
59
+ # @param strict [Boolean]
60
+ # Raise an error if any conversion fails.
61
+ # @param exact [Boolean]
62
+ # Require an exact format match. If false, allow the format to match anywhere
63
+ # in the target string.
64
+ # @param cache [Boolean]
65
+ # Use a cache of unique, converted datetimes to apply the conversion.
66
+ #
67
+ # @return [Expr]
68
+ #
69
+ # @example
70
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
71
+ # s.str.to_datetime("%Y-%m-%d %H:%M%#z")
72
+ # # =>
73
+ # # shape: (2,)
74
+ # # Series: '' [datetime[μs, UTC]]
75
+ # # [
76
+ # # 2020-01-01 01:00:00 UTC
77
+ # # 2020-01-01 02:00:00 UTC
78
+ # # ]
79
+ def to_datetime(
80
+ format = nil,
81
+ time_unit: nil,
82
+ time_zone: nil,
83
+ strict: true,
84
+ exact: true,
85
+ cache: true,
86
+ use_earliest: nil,
87
+ ambiguous: "raise"
88
+ )
89
+ _validate_format_argument(format)
90
+ ambiguous = Utils.rename_use_earliest_to_ambiguous(use_earliest, ambiguous)
91
+ ambiguous = Polars.lit(ambiguous) unless ambiguous.is_a?(Expr)
92
+ Utils.wrap_expr(
93
+ self._rbexpr.str_to_datetime(
94
+ format,
95
+ time_unit,
96
+ time_zone,
97
+ strict,
98
+ exact,
99
+ cache,
100
+ ambiguous._rbexpr
101
+ )
102
+ )
103
+ end
104
+
105
+ # Convert a Utf8 column into a Time column.
106
+ #
107
+ # @param format [String]
108
+ # Format to use for conversion. Refer to the
109
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
110
+ # for the full specification. Example: `"%H:%M:%S"`.
111
+ # If set to nil (default), the format is inferred from the data.
112
+ # @param strict [Boolean]
113
+ # Raise an error if any conversion fails.
114
+ # @param cache [Boolean]
115
+ # Use a cache of unique, converted times to apply the conversion.
116
+ #
117
+ # @return [Expr]
118
+ #
119
+ # @example
120
+ # s = Polars::Series.new(["01:00", "02:00", "03:00"])
121
+ # s.str.to_time("%H:%M")
122
+ # # =>
123
+ # # shape: (3,)
124
+ # # Series: '' [time]
125
+ # # [
126
+ # # 01:00:00
127
+ # # 02:00:00
128
+ # # 03:00:00
129
+ # # ]
130
+ def to_time(format = nil, strict: true, cache: true)
131
+ _validate_format_argument(format)
132
+ Utils.wrap_expr(_rbexpr.str_to_time(format, strict, cache))
133
+ end
134
+
135
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
136
+ #
137
+ # @param dtype [Object]
138
+ # The data type to convert into. Can be either Date, Datetime, or Time.
139
+ # @param format [String]
140
+ # Format to use, refer to the
141
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
142
+ # for specification. Example: `"%y-%m-%d"`.
143
+ # @param strict [Boolean]
144
+ # Raise an error if any conversion fails.
145
+ # @param exact [Boolean]
146
+ # - If true, require an exact format match.
147
+ # - If false, allow the format to match anywhere in the target string.
148
+ # @param utc [Boolean]
149
+ # Parse timezone aware datetimes as UTC. This may be useful if you have data
150
+ # with mixed offsets.
151
+ #
152
+ # @return [Expr]
153
+ #
154
+ # @note
155
+ # When parsing a Datetime the column precision will be inferred from
156
+ # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
157
+ # no fractional second component is found then the default is "us".
158
+ #
159
+ # @example Dealing with a consistent format:
160
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
161
+ # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
162
+ # # =>
163
+ # # shape: (2,)
164
+ # # Series: '' [datetime[μs, UTC]]
165
+ # # [
166
+ # # 2020-01-01 01:00:00 UTC
167
+ # # 2020-01-01 02:00:00 UTC
168
+ # # ]
169
+ #
170
+ # @example Dealing with different formats.
171
+ # s = Polars::Series.new(
172
+ # "date",
173
+ # [
174
+ # "2021-04-22",
175
+ # "2022-01-04 00:00:00",
176
+ # "01/31/22",
177
+ # "Sun Jul 8 00:34:60 2001",
178
+ # ]
179
+ # )
180
+ # s.to_frame.select(
181
+ # Polars.coalesce(
182
+ # Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
183
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
184
+ # Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
185
+ # Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
186
+ # )
187
+ # ).to_series
188
+ # # =>
189
+ # # shape: (4,)
190
+ # # Series: 'date' [date]
191
+ # # [
192
+ # # 2021-04-22
193
+ # # 2022-01-04
194
+ # # 2022-01-31
195
+ # # 2001-07-08
196
+ # # ]
197
+ def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
198
+ _validate_format_argument(format)
199
+
200
+ if dtype == Date
201
+ to_date(format, strict: strict, exact: exact, cache: cache)
202
+ elsif dtype == Datetime || dtype.is_a?(Datetime)
203
+ dtype = Datetime.new if dtype == Datetime
204
+ time_unit = dtype.time_unit
205
+ time_zone = dtype.time_zone
206
+ to_datetime(format, time_unit: time_unit, time_zone: time_zone, strict: strict, exact: exact, cache: cache)
207
+ elsif dtype == Time
208
+ to_time(format, strict: strict, cache: cache)
209
+ else
210
+ raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
211
+ end
212
+ end
213
+
214
+ # Convert a String column into a Decimal column.
215
+ #
216
+ # This method infers the needed parameters `precision` and `scale`.
217
+ #
218
+ # @param inference_length [Integer]
219
+ # Number of elements to parse to determine the `precision` and `scale`.
220
+ #
221
+ # @return [Expr]
222
+ #
223
+ # @example
224
+ # df = Polars::DataFrame.new(
225
+ # {
226
+ # "numbers": [
227
+ # "40.12",
228
+ # "3420.13",
229
+ # "120134.19",
230
+ # "3212.98",
231
+ # "12.90",
232
+ # "143.09",
233
+ # "143.9"
234
+ # ]
235
+ # }
236
+ # )
237
+ # df.with_columns(numbers_decimal: Polars.col("numbers").str.to_decimal)
238
+ # # =>
239
+ # # shape: (7, 2)
240
+ # # ┌───────────┬─────────────────┐
241
+ # # │ numbers ┆ numbers_decimal │
242
+ # # │ --- ┆ --- │
243
+ # # │ str ┆ decimal[*,2] │
244
+ # # ╞═══════════╪═════════════════╡
245
+ # # │ 40.12 ┆ 40.12 │
246
+ # # │ 3420.13 ┆ 3420.13 │
247
+ # # │ 120134.19 ┆ 120134.19 │
248
+ # # │ 3212.98 ┆ 3212.98 │
249
+ # # │ 12.90 ┆ 12.90 │
250
+ # # │ 143.09 ┆ 143.09 │
251
+ # # │ 143.9 ┆ 143.90 │
252
+ # # └───────────┴─────────────────┘
253
+ def to_decimal(inference_length = 100)
254
+ Utils.wrap_expr(_rbexpr.str_to_decimal(inference_length))
255
+ end
256
+
257
+ # Get length of the strings as `:u32` (as number of bytes).
258
+ #
259
+ # @return [Expr]
260
+ #
261
+ # @note
262
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
263
+ # need the length in terms of the number of characters, use `n_chars` instead.
264
+ #
265
+ # @example
266
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
267
+ # [
268
+ # Polars.col("s").str.len_bytes.alias("length"),
269
+ # Polars.col("s").str.len_chars.alias("nchars")
270
+ # ]
271
+ # )
272
+ # df
273
+ # # =>
274
+ # # shape: (4, 3)
275
+ # # ┌──────┬────────┬────────┐
276
+ # # │ s ┆ length ┆ nchars │
277
+ # # │ --- ┆ --- ┆ --- │
278
+ # # │ str ┆ u32 ┆ u32 │
279
+ # # ╞══════╪════════╪════════╡
280
+ # # │ Café ┆ 5 ┆ 4 │
281
+ # # │ null ┆ null ┆ null │
282
+ # # │ 345 ┆ 3 ┆ 3 │
283
+ # # │ 東京 ┆ 6 ┆ 2 │
284
+ # # └──────┴────────┴────────┘
285
+ def len_bytes
286
+ Utils.wrap_expr(_rbexpr.str_len_bytes)
287
+ end
288
+ alias_method :lengths, :len_bytes
289
+
290
+ # Get length of the strings as `:u32` (as number of chars).
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @note
295
+ # If you know that you are working with ASCII text, `lengths` will be
296
+ # equivalent, and faster (returns length in terms of the number of bytes).
297
+ #
298
+ # @example
299
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
300
+ # [
301
+ # Polars.col("s").str.len_bytes.alias("length"),
302
+ # Polars.col("s").str.len_chars.alias("nchars")
303
+ # ]
304
+ # )
305
+ # df
306
+ # # =>
307
+ # # shape: (4, 3)
308
+ # # ┌──────┬────────┬────────┐
309
+ # # │ s ┆ length ┆ nchars │
310
+ # # │ --- ┆ --- ┆ --- │
311
+ # # │ str ┆ u32 ┆ u32 │
312
+ # # ╞══════╪════════╪════════╡
313
+ # # │ Café ┆ 5 ┆ 4 │
314
+ # # │ null ┆ null ┆ null │
315
+ # # │ 345 ┆ 3 ┆ 3 │
316
+ # # │ 東京 ┆ 6 ┆ 2 │
317
+ # # └──────┴────────┴────────┘
318
+ def len_chars
319
+ Utils.wrap_expr(_rbexpr.str_len_chars)
320
+ end
321
+ alias_method :n_chars, :len_chars
322
+
323
+ # Vertically concat the values in the Series to a single string value.
324
+ #
325
+ # @param delimiter [String]
326
+ # The delimiter to insert between consecutive string values.
327
+ # @param ignore_nulls [Boolean]
328
+ # Ignore null values (default).
329
+ #
330
+ # @return [Expr]
331
+ #
332
+ # @example
333
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
334
+ # df.select(Polars.col("foo").str.concat("-"))
335
+ # # =>
336
+ # # shape: (1, 1)
337
+ # # ┌─────┐
338
+ # # │ foo │
339
+ # # │ --- │
340
+ # # │ str │
341
+ # # ╞═════╡
342
+ # # │ 1-2 │
343
+ # # └─────┘
344
+ #
345
+ # @example
346
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
347
+ # df.select(Polars.col("foo").str.concat("-", ignore_nulls: false))
348
+ # # =>
349
+ # # shape: (1, 1)
350
+ # # ┌──────┐
351
+ # # │ foo │
352
+ # # │ --- │
353
+ # # │ str │
354
+ # # ╞══════╡
355
+ # # │ null │
356
+ # # └──────┘
357
+ def concat(delimiter = "-", ignore_nulls: true)
358
+ Utils.wrap_expr(_rbexpr.str_concat(delimiter, ignore_nulls))
359
+ end
360
+
361
+ # Transform to uppercase variant.
362
+ #
363
+ # @return [Expr]
364
+ #
365
+ # @example
366
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
367
+ # df.select(Polars.col("foo").str.to_uppercase)
368
+ # # =>
369
+ # # shape: (2, 1)
370
+ # # ┌─────┐
371
+ # # │ foo │
372
+ # # │ --- │
373
+ # # │ str │
374
+ # # ╞═════╡
375
+ # # │ CAT │
376
+ # # │ DOG │
377
+ # # └─────┘
378
+ def to_uppercase
379
+ Utils.wrap_expr(_rbexpr.str_to_uppercase)
380
+ end
381
+
382
+ # Transform to lowercase variant.
383
+ #
384
+ # @return [Expr]
385
+ #
386
+ # @example
387
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
388
+ # df.select(Polars.col("foo").str.to_lowercase)
389
+ # # =>
390
+ # # shape: (2, 1)
391
+ # # ┌─────┐
392
+ # # │ foo │
393
+ # # │ --- │
394
+ # # │ str │
395
+ # # ╞═════╡
396
+ # # │ cat │
397
+ # # │ dog │
398
+ # # └─────┘
399
+ def to_lowercase
400
+ Utils.wrap_expr(_rbexpr.str_to_lowercase)
401
+ end
402
+
403
+ # Transform to titlecase variant.
404
+ #
405
+ # @return [Expr]
406
+ #
407
+ # @example
408
+ # df = Polars::DataFrame.new(
409
+ # {"sing": ["welcome to my world", "THERE'S NO TURNING BACK"]}
410
+ # )
411
+ # df.with_columns(foo_title: Polars.col("sing").str.to_titlecase)
412
+ # # =>
413
+ # # shape: (2, 2)
414
+ # # ┌─────────────────────────┬─────────────────────────┐
415
+ # # │ sing ┆ foo_title │
416
+ # # │ --- ┆ --- │
417
+ # # │ str ┆ str │
418
+ # # ╞═════════════════════════╪═════════════════════════╡
419
+ # # │ welcome to my world ┆ Welcome To My World │
420
+ # # │ THERE'S NO TURNING BACK ┆ There's No Turning Back │
421
+ # # └─────────────────────────┴─────────────────────────┘
422
+ def to_titlecase
423
+ raise Todo
424
+ Utils.wrap_expr(_rbexpr.str_to_titlecase)
425
+ end
426
+
427
+ # Remove leading and trailing whitespace.
428
+ #
429
+ # @param characters [String, nil]
430
+ # An optional single character that should be trimmed.
431
+ #
432
+ # @return [Expr]
433
+ #
434
+ # @example
435
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
436
+ # df.select(Polars.col("foo").str.strip)
437
+ # # =>
438
+ # # shape: (3, 1)
439
+ # # ┌───────┐
440
+ # # │ foo │
441
+ # # │ --- │
442
+ # # │ str │
443
+ # # ╞═══════╡
444
+ # # │ lead │
445
+ # # │ trail │
446
+ # # │ both │
447
+ # # └───────┘
448
+ def strip_chars(characters = nil)
449
+ characters = Utils.parse_as_expression(characters, str_as_lit: true)
450
+ Utils.wrap_expr(_rbexpr.str_strip_chars(characters))
451
+ end
452
+ alias_method :strip, :strip_chars
453
+
454
+ # Remove leading whitespace.
455
+ #
456
+ # @param characters [String, nil]
457
+ # An optional single character that should be trimmed.
458
+ #
459
+ # @return [Expr]
460
+ #
461
+ # @example
462
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
463
+ # df.select(Polars.col("foo").str.lstrip)
464
+ # # =>
465
+ # # shape: (3, 1)
466
+ # # ┌────────┐
467
+ # # │ foo │
468
+ # # │ --- │
469
+ # # │ str │
470
+ # # ╞════════╡
471
+ # # │ lead │
472
+ # # │ trail │
473
+ # # │ both │
474
+ # # └────────┘
475
+ def strip_chars_start(characters = nil)
476
+ characters = Utils.parse_as_expression(characters, str_as_lit: true)
477
+ Utils.wrap_expr(_rbexpr.str_strip_chars_start(characters))
478
+ end
479
+ alias_method :lstrip, :strip_chars_start
480
+
481
+ # Remove trailing whitespace.
482
+ #
483
+ # @param characters [String, nil]
484
+ # An optional single character that should be trimmed.
485
+ #
486
+ # @return [Expr]
487
+ #
488
+ # @example
489
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
490
+ # df.select(Polars.col("foo").str.rstrip)
491
+ # # =>
492
+ # # shape: (3, 1)
493
+ # # ┌───────┐
494
+ # # │ foo │
495
+ # # │ --- │
496
+ # # │ str │
497
+ # # ╞═══════╡
498
+ # # │ lead │
499
+ # # │ trail │
500
+ # # │ both │
501
+ # # └───────┘
502
+ def strip_chars_end(characters = nil)
503
+ characters = Utils.parse_as_expression(characters, str_as_lit: true)
504
+ Utils.wrap_expr(_rbexpr.str_strip_chars_end(characters))
505
+ end
506
+ alias_method :rstrip, :strip_chars_end
507
+
508
+ # Remove prefix.
509
+ #
510
+ # The prefix will be removed from the string exactly once, if found.
511
+ #
512
+ # @param prefix [String]
513
+ # The prefix to be removed.
514
+ #
515
+ # @return [Expr]
516
+ #
517
+ # @example
518
+ # df = Polars::DataFrame.new({"a" => ["foobar", "foofoobar", "foo", "bar"]})
519
+ # df.with_columns(Polars.col("a").str.strip_prefix("foo").alias("stripped"))
520
+ # # =>
521
+ # # shape: (4, 2)
522
+ # # ┌───────────┬──────────┐
523
+ # # │ a ┆ stripped │
524
+ # # │ --- ┆ --- │
525
+ # # │ str ┆ str │
526
+ # # ╞═══════════╪══════════╡
527
+ # # │ foobar ┆ bar │
528
+ # # │ foofoobar ┆ foobar │
529
+ # # │ foo ┆ │
530
+ # # │ bar ┆ bar │
531
+ # # └───────────┴──────────┘
532
+ def strip_prefix(prefix)
533
+ prefix = Utils.parse_as_expression(prefix, str_as_lit: true)
534
+ Utils.wrap_expr(_rbexpr.str_strip_prefix(prefix))
535
+ end
536
+
537
+ # Remove suffix.
538
+ #
539
+ # The suffix will be removed from the string exactly once, if found.
540
+ #
541
+ #
542
+ # @param suffix [String]
543
+ # The suffix to be removed.
544
+ #
545
+ # @return [Expr]
546
+ #
547
+ # @example
548
+ # df = Polars::DataFrame.new({"a" => ["foobar", "foobarbar", "foo", "bar"]})
549
+ # df.with_columns(Polars.col("a").str.strip_suffix("bar").alias("stripped"))
550
+ # # =>
551
+ # # shape: (4, 2)
552
+ # # ┌───────────┬──────────┐
553
+ # # │ a ┆ stripped │
554
+ # # │ --- ┆ --- │
555
+ # # │ str ┆ str │
556
+ # # ╞═══════════╪══════════╡
557
+ # # │ foobar ┆ foo │
558
+ # # │ foobarbar ┆ foobar │
559
+ # # │ foo ┆ foo │
560
+ # # │ bar ┆ │
561
+ # # └───────────┴──────────┘
562
+ def strip_suffix(suffix)
563
+ suffix = Utils.parse_as_expression(suffix, str_as_lit: true)
564
+ Utils.wrap_expr(_rbexpr.str_strip_suffix(suffix))
565
+ end
566
+
567
+ # Pad the start of the string until it reaches the given length.
568
+ #
569
+ # @param length [Integer]
570
+ # Pad the string until it reaches this length. Strings with length equal to
571
+ # or greater than this value are returned as-is.
572
+ # @param fill_char [String]
573
+ # The character to pad the string with.
574
+ #
575
+ # @return [Expr]
576
+ #
577
+ # @example
578
+ # df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
579
+ # df.with_columns(padded: Polars.col("a").str.pad_start(8, "*"))
580
+ # # =>
581
+ # # shape: (4, 2)
582
+ # # ┌──────────────┬──────────────┐
583
+ # # │ a ┆ padded │
584
+ # # │ --- ┆ --- │
585
+ # # │ str ┆ str │
586
+ # # ╞══════════════╪══════════════╡
587
+ # # │ cow ┆ *****cow │
588
+ # # │ monkey ┆ **monkey │
589
+ # # │ hippopotamus ┆ hippopotamus │
590
+ # # │ null ┆ null │
591
+ # # └──────────────┴──────────────┘
592
+ def pad_start(length, fill_char = " ")
593
+ Utils.wrap_expr(_rbexpr.str_pad_start(length, fill_char))
594
+ end
595
+ alias_method :rjust, :pad_start
596
+
597
+ # Pad the end of the string until it reaches the given length.
598
+ #
599
+ # @param length [Integer]
600
+ # Pad the string until it reaches this length. Strings with length equal to
601
+ # or greater than this value are returned as-is.
602
+ # @param fill_char [String]
603
+ # The character to pad the string with.
604
+ #
605
+ # @return [Expr]
606
+ #
607
+ # @example
608
+ # df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
609
+ # df.with_columns(padded: Polars.col("a").str.pad_end(8, "*"))
610
+ # # =>
611
+ # # shape: (4, 2)
612
+ # # ┌──────────────┬──────────────┐
613
+ # # │ a ┆ padded │
614
+ # # │ --- ┆ --- │
615
+ # # │ str ┆ str │
616
+ # # ╞══════════════╪══════════════╡
617
+ # # │ cow ┆ cow***** │
618
+ # # │ monkey ┆ monkey** │
619
+ # # │ hippopotamus ┆ hippopotamus │
620
+ # # │ null ┆ null │
621
+ # # └──────────────┴──────────────┘
622
+ def pad_end(length, fill_char = " ")
623
+ Utils.wrap_expr(_rbexpr.str_pad_end(length, fill_char))
624
+ end
625
+ alias_method :ljust, :pad_end
626
+
627
+ # Fills the string with zeroes.
628
+ #
629
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
630
+ # of the given `length`.
631
+ #
632
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
633
+ # sign character rather than before. The original string is returned if `length` is
634
+ # less than or equal to `s.length`.
635
+ #
636
+ # @param length [Integer]
637
+ # Fill the value up to this length
638
+ #
639
+ # @return [Expr]
640
+ #
641
+ # @example
642
+ # df = Polars::DataFrame.new({"a" => [-1, 123, 999999, nil]})
643
+ # df.with_columns(Polars.col("a").cast(Polars::String).str.zfill(4).alias("zfill"))
644
+ # # =>
645
+ # # shape: (4, 2)
646
+ # # ┌────────┬────────┐
647
+ # # │ a ┆ zfill │
648
+ # # │ --- ┆ --- │
649
+ # # │ i64 ┆ str │
650
+ # # ╞════════╪════════╡
651
+ # # │ -1 ┆ -001 │
652
+ # # │ 123 ┆ 0123 │
653
+ # # │ 999999 ┆ 999999 │
654
+ # # │ null ┆ null │
655
+ # # └────────┴────────┘
656
+ def zfill(length)
657
+ length = Utils.parse_as_expression(length)
658
+ Utils.wrap_expr(_rbexpr.str_zfill(length))
659
+ end
660
+
661
+ # Check if string contains a substring that matches a regex.
662
+ #
663
+ # @param pattern [String]
664
+ # A valid regex pattern.
665
+ # @param literal [Boolean]
666
+ # Treat pattern as a literal string.
667
+ #
668
+ # @return [Expr]
669
+ #
670
+ # @example
671
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
672
+ # df.select(
673
+ # [
674
+ # Polars.col("a"),
675
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
676
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
677
+ # ]
678
+ # )
679
+ # # =>
680
+ # # shape: (4, 3)
681
+ # # ┌─────────────┬───────┬─────────┐
682
+ # # │ a ┆ regex ┆ literal │
683
+ # # │ --- ┆ --- ┆ --- │
684
+ # # │ str ┆ bool ┆ bool │
685
+ # # ╞═════════════╪═══════╪═════════╡
686
+ # # │ Crab ┆ false ┆ false │
687
+ # # │ cat and dog ┆ true ┆ false │
688
+ # # │ rab$bit ┆ true ┆ true │
689
+ # # │ null ┆ null ┆ null │
690
+ # # └─────────────┴───────┴─────────┘
691
+ def contains(pattern, literal: false, strict: true)
692
+ pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)._rbexpr
693
+ Utils.wrap_expr(_rbexpr.str_contains(pattern, literal, strict))
694
+ end
695
+
696
+ # Check if string values end with a substring.
697
+ #
698
+ # @param sub [String]
699
+ # Suffix substring.
700
+ #
701
+ # @return [Expr]
702
+ #
703
+ # @example
704
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
705
+ # df.with_column(
706
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
707
+ # )
708
+ # # =>
709
+ # # shape: (3, 2)
710
+ # # ┌────────┬────────────┐
711
+ # # │ fruits ┆ has_suffix │
712
+ # # │ --- ┆ --- │
713
+ # # │ str ┆ bool │
714
+ # # ╞════════╪════════════╡
715
+ # # │ apple ┆ false │
716
+ # # │ mango ┆ true │
717
+ # # │ null ┆ null │
718
+ # # └────────┴────────────┘
719
+ #
720
+ # @example Using `ends_with` as a filter condition:
721
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
722
+ # # =>
723
+ # # shape: (1, 1)
724
+ # # ┌────────┐
725
+ # # │ fruits │
726
+ # # │ --- │
727
+ # # │ str │
728
+ # # ╞════════╡
729
+ # # │ mango │
730
+ # # └────────┘
731
+ def ends_with(sub)
732
+ sub = Utils.expr_to_lit_or_expr(sub, str_to_lit: true)._rbexpr
733
+ Utils.wrap_expr(_rbexpr.str_ends_with(sub))
734
+ end
735
+
736
+ # Check if string values start with a substring.
737
+ #
738
+ # @param sub [String]
739
+ # Prefix substring.
740
+ #
741
+ # @return [Expr]
742
+ #
743
+ # @example
744
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
745
+ # df.with_column(
746
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
747
+ # )
748
+ # # =>
749
+ # # shape: (3, 2)
750
+ # # ┌────────┬────────────┐
751
+ # # │ fruits ┆ has_prefix │
752
+ # # │ --- ┆ --- │
753
+ # # │ str ┆ bool │
754
+ # # ╞════════╪════════════╡
755
+ # # │ apple ┆ true │
756
+ # # │ mango ┆ false │
757
+ # # │ null ┆ null │
758
+ # # └────────┴────────────┘
759
+ #
760
+ # @example Using `starts_with` as a filter condition:
761
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
762
+ # # =>
763
+ # # shape: (1, 1)
764
+ # # ┌────────┐
765
+ # # │ fruits │
766
+ # # │ --- │
767
+ # # │ str │
768
+ # # ╞════════╡
769
+ # # │ apple │
770
+ # # └────────┘
771
+ def starts_with(sub)
772
+ sub = Utils.expr_to_lit_or_expr(sub, str_to_lit: true)._rbexpr
773
+ Utils.wrap_expr(_rbexpr.str_starts_with(sub))
774
+ end
775
+
776
+ # Parse string values as JSON.
777
+ #
778
+ # Raises an error if an invalid JSON string is encountered.
779
+ #
780
+ # @param dtype [Object]
781
+ # The dtype to cast the extracted value to. If nil, the dtype will be
782
+ # inferred from the JSON value.
783
+ #
784
+ # @return [Expr]
785
+ #
786
+ # @example
787
+ # df = Polars::DataFrame.new(
788
+ # {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
789
+ # )
790
+ # dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
791
+ # df.select(Polars.col("json").str.json_decode(dtype))
792
+ # # =>
793
+ # # shape: (3, 1)
794
+ # # ┌─────────────┐
795
+ # # │ json │
796
+ # # │ --- │
797
+ # # │ struct[2] │
798
+ # # ╞═════════════╡
799
+ # # │ {1,true} │
800
+ # # │ {null,null} │
801
+ # # │ {2,false} │
802
+ # # └─────────────┘
803
+ def json_decode(dtype = nil, infer_schema_length: 100)
804
+ if !dtype.nil?
805
+ dtype = Utils.rb_type_to_dtype(dtype)
806
+ end
807
+ Utils.wrap_expr(_rbexpr.str_json_decode(dtype, infer_schema_length))
808
+ end
809
+ alias_method :json_extract, :json_decode
810
+
811
+ # Extract the first match of json string with provided JSONPath expression.
812
+ #
813
+ # Raises an error if an invalid JSON string is encountered.
814
+ # All return values are cast to Utf8 regardless of the original value.
815
+ #
816
+ # Documentation on JSONPath standard can be found
817
+ # [here](https://goessner.net/articles/JsonPath/).
818
+ #
819
+ # @param json_path [String]
820
+ # A valid JSON path query string.
821
+ #
822
+ # @return [Expr]
823
+ #
824
+ # @example
825
+ # df = Polars::DataFrame.new(
826
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
827
+ # )
828
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
829
+ # # =>
830
+ # # shape: (5, 1)
831
+ # # ┌──────────┐
832
+ # # │ json_val │
833
+ # # │ --- │
834
+ # # │ str │
835
+ # # ╞══════════╡
836
+ # # │ 1 │
837
+ # # │ null │
838
+ # # │ 2 │
839
+ # # │ 2.1 │
840
+ # # │ true │
841
+ # # └──────────┘
842
+ def json_path_match(json_path)
843
+ Utils.wrap_expr(_rbexpr.str_json_path_match(json_path))
844
+ end
845
+
846
+ # Decode a value using the provided encoding.
847
+ #
848
+ # @param encoding ["hex", "base64"]
849
+ # The encoding to use.
850
+ # @param strict [Boolean]
851
+ # How to handle invalid inputs:
852
+ #
853
+ # - `true`: An error will be thrown if unable to decode a value.
854
+ # - `false`: Unhandled values will be replaced with `nil`.
855
+ #
856
+ # @return [Expr]
857
+ #
858
+ # @example
859
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
860
+ # df.select(Polars.col("encoded").str.decode("hex"))
861
+ # # =>
862
+ # # shape: (3, 1)
863
+ # # ┌─────────┐
864
+ # # │ encoded │
865
+ # # │ --- │
866
+ # # │ binary │
867
+ # # ╞═════════╡
868
+ # # │ b"foo" │
869
+ # # │ b"bar" │
870
+ # # │ null │
871
+ # # └─────────┘
872
# @raise [ArgumentError]
#   If `encoding` is neither `"hex"` nor `"base64"`.
def decode(encoding, strict: true)
  if encoding == "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  elsif encoding == "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # Single braces on purpose: "{{" / "}}" is a Python f-string escape and
    # Ruby would print the doubled braces literally in the message.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
  end
end
881
+
882
+ # Encode a value using the provided encoding.
883
+ #
884
+ # @param encoding ["hex", "base64"]
885
+ # The encoding to use.
886
+ #
887
+ # @return [Expr]
888
+ #
889
+ # @example
890
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
891
+ # df.select(Polars.col("strings").str.encode("hex"))
892
+ # # =>
893
+ # # shape: (3, 1)
894
+ # # ┌─────────┐
895
+ # # │ strings │
896
+ # # │ --- │
897
+ # # │ str │
898
+ # # ╞═════════╡
899
+ # # │ 666f6f │
900
+ # # │ 626172 │
901
+ # # │ null │
902
+ # # └─────────┘
903
# @raise [ArgumentError]
#   If `encoding` is neither `"hex"` nor `"base64"`.
def encode(encoding)
  if encoding == "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  elsif encoding == "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # Single braces on purpose: "{{" / "}}" is a Python f-string escape and
    # Ruby would print the doubled braces literally in the message.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
  end
end
912
+
913
+ # Extract the target capture group from provided patterns.
914
+ #
915
+ # @param pattern [String]
916
+ # A valid regex pattern
917
+ # @param group_index [Integer]
918
+ # Index of the targeted capture group.
919
+ # Group 0 means the whole pattern; the first capture group begins at index 1.
920
+ # Defaults to the first capture group.
921
+ #
922
+ # @return [Expr]
923
+ #
924
+ # @example
925
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
926
+ # df.select(
927
+ # [
928
+ # Polars.col("foo").str.extract('(\d+)')
929
+ # ]
930
+ # )
931
+ # # =>
932
+ # # shape: (2, 1)
933
+ # # ┌─────┐
934
+ # # │ foo │
935
+ # # │ --- │
936
+ # # │ str │
937
+ # # ╞═════╡
938
+ # # │ 123 │
939
+ # # │ 678 │
940
+ # # └─────┘
941
def extract(pattern, group_index: 1)
  # Parse the pattern with str_as_lit enabled, then hand it and the group
  # index to the native str_extract call.
  pattern_expr = Utils.parse_as_expression(pattern, str_as_lit: true)
  extracted = _rbexpr.str_extract(pattern_expr, group_index)
  Utils.wrap_expr(extracted)
end
945
+
946
+ # Extracts all matches for the given regex pattern.
947
+ #
948
+ # Extracts each successive non-overlapping regex match in an individual string as
949
+ # an array.
950
+ #
951
+ # @param pattern [String]
952
+ # A valid regex pattern
953
+ #
954
+ # @return [Expr]
955
+ #
956
+ # @example
957
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
958
+ # df.select(
959
+ # [
960
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
961
+ # ]
962
+ # )
963
+ # # =>
964
+ # # shape: (2, 1)
965
+ # # ┌────────────────┐
966
+ # # │ extracted_nrs │
967
+ # # │ --- │
968
+ # # │ list[str] │
969
+ # # ╞════════════════╡
970
+ # # │ ["123", "45"] │
971
+ # # │ ["678", "910"] │
972
+ # # └────────────────┘
973
def extract_all(pattern)
  # Convert the pattern via expr_to_lit_or_expr (str_to_lit enabled) and pass
  # its underlying native expression to str_extract_all.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  matches = _rbexpr.str_extract_all(pattern_expr._rbexpr)
  Utils.wrap_expr(matches)
end
977
+
978
+ # Extract all capture groups for the given regex pattern.
979
+ #
980
+ # @param pattern [String]
981
+ # A valid regular expression pattern containing at least one capture group,
982
+ # compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
983
+ #
984
+ # @return [Expr]
985
+ #
986
+ # @example
987
+ # df = Polars::DataFrame.new(
988
+ # {
989
+ # "url": [
990
+ # "http://vote.com/ballon_dor?candidate=messi&ref=python",
991
+ # "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
992
+ # "http://vote.com/ballon_dor?error=404&ref=rust"
993
+ # ]
994
+ # }
995
+ # )
996
+ # pattern = /candidate=(?<candidate>\w+)&ref=(?<ref>\w+)/.to_s
997
+ # df.select(captures: Polars.col("url").str.extract_groups(pattern)).unnest(
998
+ # "captures"
999
+ # )
1000
+ # # =>
1001
+ # # shape: (3, 2)
1002
+ # # ┌───────────┬────────┐
1003
+ # # │ candidate ┆ ref │
1004
+ # # │ --- ┆ --- │
1005
+ # # │ str ┆ str │
1006
+ # # ╞═══════════╪════════╡
1007
+ # # │ messi ┆ python │
1008
+ # # │ weghorst ┆ polars │
1009
+ # # │ null ┆ null │
1010
+ # # └───────────┴────────┘
1011
+ #
1012
+ # @example Unnamed groups have their numerical position converted to a string:
1013
+ # pattern = /candidate=(\w+)&ref=(\w+)/.to_s
1014
+ # (
1015
+ # df.with_columns(
1016
+ # captures: Polars.col("url").str.extract_groups(pattern)
1017
+ # ).with_columns(name: Polars.col("captures").struct["1"].str.to_uppercase)
1018
+ # )
1019
+ # # =>
1020
+ # # shape: (3, 3)
1021
+ # # ┌───────────────────────────────────┬───────────────────────┬──────────┐
1022
+ # # │ url ┆ captures ┆ name │
1023
+ # # │ --- ┆ --- ┆ --- │
1024
+ # # │ str ┆ struct[2] ┆ str │
1025
+ # # ╞═══════════════════════════════════╪═══════════════════════╪══════════╡
1026
+ # # │ http://vote.com/ballon_dor?candi… ┆ {"messi","python"} ┆ MESSI │
1027
+ # # │ http://vote.com/ballon_dor?candi… ┆ {"weghorst","polars"} ┆ WEGHORST │
1028
+ # # │ http://vote.com/ballon_dor?error… ┆ {null,null} ┆ null │
1029
+ # # └───────────────────────────────────┴───────────────────────┴──────────┘
1030
def extract_groups(pattern)
  # The pattern is forwarded as given (no expression parsing here).
  groups = _rbexpr.str_extract_groups(pattern)
  Utils.wrap_expr(groups)
end
1033
+
1034
+ # Count all successive non-overlapping regex matches.
1035
+ #
1036
+ # @param pattern [String]
1037
+ # A valid regex pattern
1038
+ #
1039
+ # @return [Expr]
1040
+ #
1041
+ # @example
1042
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
1043
+ # df.select(
1044
+ # [
1045
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
1046
+ # ]
1047
+ # )
1048
+ # # =>
1049
+ # # shape: (2, 1)
1050
+ # # ┌──────────────┐
1051
+ # # │ count_digits │
1052
+ # # │ --- │
1053
+ # # │ u32 │
1054
+ # # ╞══════════════╡
1055
+ # # │ 5 │
1056
+ # # │ 6 │
1057
+ # # └──────────────┘
1058
def count_matches(pattern, literal: false)
  # Parse the pattern with str_as_lit enabled; the `literal` flag is passed
  # through unchanged to the native str_count_matches call.
  pattern_expr = Utils.parse_as_expression(pattern, str_as_lit: true)
  counted = _rbexpr.str_count_matches(pattern_expr, literal)
  Utils.wrap_expr(counted)
end
1062
+ alias_method :count_match, :count_matches
1063
+
1064
+ # Split the string by a substring.
1065
+ #
1066
+ # @param by [String]
1067
+ # Substring to split by.
1068
+ # @param inclusive [Boolean]
1069
+ # If true, include the split character/string in the results.
1070
+ #
1071
+ # @return [Expr]
1072
+ #
1073
+ # @example
1074
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
1075
+ # df.select(Polars.col("s").str.split(" "))
1076
+ # # =>
1077
+ # # shape: (3, 1)
1078
+ # # ┌───────────────────────┐
1079
+ # # │ s │
1080
+ # # │ --- │
1081
+ # # │ list[str] │
1082
+ # # ╞═══════════════════════╡
1083
+ # # │ ["foo", "bar"] │
1084
+ # # │ ["foo-bar"] │
1085
+ # # │ ["foo", "bar", "baz"] │
1086
+ # # └───────────────────────┘
1087
def split(by, inclusive: false)
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  # Pick the native call that matches the `inclusive` flag.
  pieces =
    if inclusive
      _rbexpr.str_split_inclusive(separator)
    else
      _rbexpr.str_split(separator)
    end
  Utils.wrap_expr(pieces)
end
1095
+
1096
+ # Split the string by a substring using `n` splits.
1097
+ #
1098
+ # Results in a struct of `n+1` fields.
1099
+ #
1100
+ # If it cannot make `n` splits, the remaining field elements will be null.
1101
+ #
1102
+ # @param by [String]
1103
+ # Substring to split by.
1104
+ # @param n [Integer]
1105
+ # Number of splits to make.
1106
+ # @param inclusive [Boolean]
1107
+ # If true, include the split character/string in the results.
1108
+ #
1109
+ # @return [Expr]
1110
+ #
1111
+ # @example
1112
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
1113
+ # df.select(
1114
+ # [
1115
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
1116
+ # ]
1117
+ # )
1118
+ # # =>
1119
+ # # shape: (4, 1)
1120
+ # # ┌─────────────┐
1121
+ # # │ fields │
1122
+ # # │ --- │
1123
+ # # │ struct[2] │
1124
+ # # ╞═════════════╡
1125
+ # # │ {"a","1"} │
1126
+ # # │ {null,null} │
1127
+ # # │ {"c",null} │
1128
+ # # │ {"d","4"} │
1129
+ # # └─────────────┘
1130
def split_exact(by, n, inclusive: false)
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  # Pick the native call that matches the `inclusive` flag; `n` is passed
  # through as given.
  fields =
    if inclusive
      _rbexpr.str_split_exact_inclusive(separator, n)
    else
      _rbexpr.str_split_exact(separator, n)
    end
  Utils.wrap_expr(fields)
end
1138
+
1139
+ # Split the string by a substring, restricted to returning at most `n` items.
1140
+ #
1141
+ # If the number of possible splits is less than `n-1`, the remaining field
1142
+ # elements will be null. If the number of possible splits is `n-1` or greater,
1143
+ # the last (nth) substring will contain the remainder of the string.
1144
+ #
1145
+ # @param by [String]
1146
+ # Substring to split by.
1147
+ # @param n [Integer]
1148
+ # Max number of items to return.
1149
+ #
1150
+ # @return [Expr]
1151
+ #
1152
+ # @example
1153
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
1154
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
1155
+ # # =>
1156
+ # # shape: (4, 1)
1157
+ # # ┌───────────────────┐
1158
+ # # │ fields │
1159
+ # # │ --- │
1160
+ # # │ struct[2] │
1161
+ # # ╞═══════════════════╡
1162
+ # # │ {"foo","bar"} │
1163
+ # # │ {null,null} │
1164
+ # # │ {"foo-bar",null} │
1165
+ # # │ {"foo","bar baz"} │
1166
+ # # └───────────────────┘
1167
def splitn(by, n)
  # Parse the separator with str_as_lit enabled and forward it with `n` to
  # the native str_splitn call.
  separator = Utils.parse_as_expression(by, str_as_lit: true)
  fields = _rbexpr.str_splitn(separator, n)
  Utils.wrap_expr(fields)
end
1171
+
1172
+ # Replace first matching regex/literal substring with a new string value.
1173
+ #
1174
+ # @param pattern [String]
1175
+ # Regex pattern.
1176
+ # @param value [String]
1177
+ # Replacement string.
1178
+ # @param literal [Boolean]
1179
+ # Treat pattern as a literal string.
1180
+ #
1181
+ # @return [Expr]
1182
+ #
1183
+ # @example
1184
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
1185
+ # df.with_column(
1186
+ # Polars.col("text").str.replace('abc\b', "ABC")
1187
+ # )
1188
+ # # =>
1189
+ # # shape: (2, 2)
1190
+ # # ┌─────┬────────┐
1191
+ # # │ id ┆ text │
1192
+ # # │ --- ┆ --- │
1193
+ # # │ i64 ┆ str │
1194
+ # # ╞═════╪════════╡
1195
+ # # │ 1 ┆ 123ABC │
1196
+ # # │ 2 ┆ abc456 │
1197
+ # # └─────┴────────┘
1198
def replace(pattern, value, literal: false, n: 1)
  # Both pattern and replacement go through expr_to_lit_or_expr with
  # str_to_lit enabled; their native expressions are what the call receives.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace_n(pattern_expr._rbexpr, value_expr._rbexpr, literal, n)
  Utils.wrap_expr(replaced)
end
1203
+
1204
+ # Replace all matching regex/literal substrings with a new string value.
1205
+ #
1206
+ # @param pattern [String]
1207
+ # Regex pattern.
1208
+ # @param value [String]
1209
+ # Replacement string.
1210
+ # @param literal [Boolean]
1211
+ # Treat pattern as a literal string.
1212
+ #
1213
+ # @return [Expr]
1214
+ #
1215
+ # @example
1216
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
1217
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
1218
+ # # =>
1219
+ # # shape: (2, 2)
1220
+ # # ┌─────┬─────────┐
1221
+ # # │ id ┆ text │
1222
+ # # │ --- ┆ --- │
1223
+ # # │ i64 ┆ str │
1224
+ # # ╞═════╪═════════╡
1225
+ # # │ 1 ┆ -bc-bc │
1226
+ # # │ 2 ┆ 123-123 │
1227
+ # # └─────┴─────────┘
1228
def replace_all(pattern, value, literal: false)
  # Both pattern and replacement go through expr_to_lit_or_expr with
  # str_to_lit enabled; their native expressions are what the call receives.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
1233
+
1234
+ # Returns string values in reversed order.
1235
+ #
1236
+ # @return [Expr]
1237
+ #
1238
+ # @example
1239
+ # df = Polars::DataFrame.new({"text" => ["foo", "bar", "man\u0303ana"]})
1240
+ # df.with_columns(Polars.col("text").str.reverse.alias("reversed"))
1241
+ # # =>
1242
+ # # shape: (3, 2)
1243
+ # # ┌────────┬──────────┐
1244
+ # # │ text ┆ reversed │
1245
+ # # │ --- ┆ --- │
1246
+ # # │ str ┆ str │
1247
+ # # ╞════════╪══════════╡
1248
+ # # │ foo ┆ oof │
1249
+ # # │ bar ┆ rab │
1250
+ # # │ mañana ┆ anañam │
1251
+ # # └────────┴──────────┘
1252
def reverse
  # Delegate to the native str_reverse call and wrap the result.
  reversed = _rbexpr.str_reverse
  Utils.wrap_expr(reversed)
end
1255
+
1256
+ # Create subslices of the string values of a Utf8 Series.
1257
+ #
1258
+ # @param offset [Integer]
1259
+ # Start index. Negative indexing is supported.
1260
+ # @param length [Integer]
1261
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
1262
+ # end of the string.
1263
+ #
1264
+ # @return [Expr]
1265
+ #
1266
+ # @example
1267
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
1268
+ # df.with_column(
1269
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
1270
+ # )
1271
+ # # =>
1272
+ # # shape: (4, 2)
1273
+ # # ┌─────────────┬──────────┐
1274
+ # # │ s ┆ s_sliced │
1275
+ # # │ --- ┆ --- │
1276
+ # # │ str ┆ str │
1277
+ # # ╞═════════════╪══════════╡
1278
+ # # │ pear ┆ ear │
1279
+ # # │ null ┆ null │
1280
+ # # │ papaya ┆ aya │
1281
+ # # │ dragonfruit ┆ uit │
1282
+ # # └─────────────┴──────────┘
1283
def slice(offset, length = nil)
  # Offset and length are each parsed as expressions (a nil length is parsed
  # too) before being handed to the native str_slice call.
  offset_expr, length_expr = [offset, length].map { |arg| Utils.parse_as_expression(arg) }
  sliced = _rbexpr.str_slice(offset_expr, length_expr)
  Utils.wrap_expr(sliced)
end
1288
+
1289
+ # Returns a column with a separate row for every string character.
1290
+ #
1291
+ # @return [Expr]
1292
+ #
1293
+ # @example
1294
+ # df = Polars::DataFrame.new({"a": ["foo", "bar"]})
1295
+ # df.select(Polars.col("a").str.explode)
1296
+ # # =>
1297
+ # # shape: (6, 1)
1298
+ # # ┌─────┐
1299
+ # # │ a │
1300
+ # # │ --- │
1301
+ # # │ str │
1302
+ # # ╞═════╡
1303
+ # # │ f │
1304
+ # # │ o │
1305
+ # # │ o │
1306
+ # # │ b │
1307
+ # # │ a │
1308
+ # # │ r │
1309
+ # # └─────┘
1310
def explode
  # Delegate to the native str_explode call and wrap the result.
  exploded = _rbexpr.str_explode
  Utils.wrap_expr(exploded)
end
1313
+
1314
+ # Convert a Utf8 column into an Int64 column, parsing with the given base (radix).
1315
+ #
1316
+ # @param base [Integer]
1317
+ # Positive integer which is the base of the string we are parsing.
1318
+ # Default: 10.
1319
+ # @param strict [Boolean]
1320
+ # Bool, default=true will raise any ParseError or overflow as ComputeError.
1321
+ # If false, such failures are silently converted to null.
1322
+ #
1323
+ # @return [Expr]
1324
+ #
1325
+ # @example
1326
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1327
+ # df.with_columns(Polars.col("bin").str.to_integer(base: 2, strict: false).alias("parsed"))
1328
+ # # =>
1329
+ # # shape: (4, 2)
1330
+ # # ┌─────────┬────────┐
1331
+ # # │ bin ┆ parsed │
1332
+ # # │ --- ┆ --- │
1333
+ # # │ str ┆ i64 │
1334
+ # # ╞═════════╪════════╡
1335
+ # # │ 110 ┆ 6 │
1336
+ # # │ 101 ┆ 5 │
1337
+ # # │ 010 ┆ 2 │
1338
+ # # │ invalid ┆ null │
1339
+ # # └─────────┴────────┘
1340
+ #
1341
+ # @example
1342
+ # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1343
+ # df.with_columns(Polars.col("hex").str.to_integer(base: 16, strict: true).alias("parsed"))
1344
+ # # =>
1345
+ # # shape: (4, 2)
1346
+ # # ┌──────┬────────┐
1347
+ # # │ hex ┆ parsed │
1348
+ # # │ --- ┆ --- │
1349
+ # # │ str ┆ i64 │
1350
+ # # ╞══════╪════════╡
1351
+ # # │ fa1e ┆ 64030 │
1352
+ # # │ ff00 ┆ 65280 │
1353
+ # # │ cafe ┆ 51966 │
1354
+ # # │ null ┆ null │
1355
+ # # └──────┴────────┘
1356
def to_integer(base: 10, strict: true)
  # The base is parsed as an expression with str_as_lit disabled; `strict`
  # is forwarded unchanged to the native str_to_integer call.
  base_expr = Utils.parse_as_expression(base, str_as_lit: false)
  parsed = _rbexpr.str_to_integer(base_expr, strict)
  Utils.wrap_expr(parsed)
end
1360
+
1361
+ # Parse integers with base radix from strings.
1362
+ #
1363
+ # The default radix is 2. With `strict: false`, parse errors and overflows become null.
1364
+ #
1365
+ # @param radix [Integer]
1366
+ # Positive integer which is the base of the string we are parsing.
1367
+ # Default: 2.
1368
+ # @param strict [Boolean]
1369
+ # Bool, Default=true will raise any ParseError or overflow as ComputeError.
1370
+ # False silently convert to Null.
1371
+ #
1372
+ # @return [Expr]
1373
+ #
1374
+ # @example
1375
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1376
+ # df.select(Polars.col("bin").str.parse_int(2, strict: false))
1377
+ # # =>
1378
+ # # shape: (4, 1)
1379
+ # # ┌──────┐
1380
+ # # │ bin │
1381
+ # # │ --- │
1382
+ # # │ i32 │
1383
+ # # ╞══════╡
1384
+ # # │ 6 │
1385
+ # # │ 5 │
1386
+ # # │ 2 │
1387
+ # # │ null │
1388
+ # # └──────┘
1389
def parse_int(radix = 2, strict: true)
  # Forward the caller's radix. The base used to be hard-coded to 2, so any
  # other radix was silently ignored. The result is then cast to Int32 with
  # the same strictness.
  to_integer(base: radix, strict: strict).cast(Int32, strict: strict)
end
1392
+
1393
+ # Use the aho-corasick algorithm to find matches.
1394
+ #
1395
+ # This version determines if any of the patterns find a match.
1396
+ #
1397
+ # @param patterns [String]
1398
+ # String patterns to search.
1399
+ # @param ascii_case_insensitive [Boolean]
1400
+ # Enable ASCII-aware case insensitive matching.
1401
+ # When this option is enabled, searching will be performed without respect
1402
+ # to case for ASCII letters (a-z and A-Z) only.
1403
+ #
1404
+ # @return [Expr]
1405
+ #
1406
+ # @example
1407
+ # df = Polars::DataFrame.new(
1408
+ # {
1409
+ # "lyrics": [
1410
+ # "Everybody wants to rule the world",
1411
+ # "Tell me what you want, what you really really want",
1412
+ # "Can you feel the love tonight"
1413
+ # ]
1414
+ # }
1415
+ # )
1416
+ # df.with_columns(
1417
+ # Polars.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
1418
+ # )
1419
+ # # =>
1420
+ # # shape: (3, 2)
1421
+ # # ┌───────────────────────────────────┬──────────────┐
1422
+ # # │ lyrics ┆ contains_any │
1423
+ # # │ --- ┆ --- │
1424
+ # # │ str ┆ bool │
1425
+ # # ╞═══════════════════════════════════╪══════════════╡
1426
+ # # │ Everybody wants to rule the worl… ┆ false │
1427
+ # # │ Tell me what you want, what you … ┆ true │
1428
+ # # │ Can you feel the love tonight ┆ true │
1429
+ # # └───────────────────────────────────┴──────────────┘
1430
def contains_any(patterns, ascii_case_insensitive: false)
  # Patterns are parsed with both str_as_lit and list_as_lit disabled; the
  # case-insensitivity flag is forwarded unchanged.
  pattern_expr = Utils.parse_as_expression(patterns, str_as_lit: false, list_as_lit: false)
  found = _rbexpr.str_contains_any(pattern_expr, ascii_case_insensitive)
  Utils.wrap_expr(found)
end
1436
+
1437
+ # Use the aho-corasick algorithm to replace many matches.
1438
+ #
1439
+ # @param patterns [String]
1440
+ # String patterns to search and replace.
1441
+ # @param replace_with [String]
1442
+ # Strings to replace where a pattern was a match.
1443
+ # This can be broadcasted. So it supports many:one and many:many.
1444
+ # @param ascii_case_insensitive [Boolean]
1445
+ # Enable ASCII-aware case insensitive matching.
1446
+ # When this option is enabled, searching will be performed without respect
1447
+ # to case for ASCII letters (a-z and A-Z) only.
1448
+ #
1449
+ # @return [Expr]
1450
+ #
1451
+ # @example
1452
+ # df = Polars::DataFrame.new(
1453
+ # {
1454
+ # "lyrics": [
1455
+ # "Everybody wants to rule the world",
1456
+ # "Tell me what you want, what you really really want",
1457
+ # "Can you feel the love tonight"
1458
+ # ]
1459
+ # }
1460
+ # )
1461
+ # df.with_columns(
1462
+ # Polars.col("lyrics")
1463
+ # .str.replace_many(
1464
+ # ["me", "you", "they"],
1465
+ # ""
1466
+ # )
1467
+ # .alias("removes_pronouns")
1468
+ # )
1469
+ # # =>
1470
+ # # shape: (3, 2)
1471
+ # # ┌───────────────────────────────────┬───────────────────────────────────┐
1472
+ # # │ lyrics ┆ removes_pronouns │
1473
+ # # │ --- ┆ --- │
1474
+ # # │ str ┆ str │
1475
+ # # ╞═══════════════════════════════════╪═══════════════════════════════════╡
1476
+ # # │ Everybody wants to rule the worl… ┆ Everybody wants to rule the worl… │
1477
+ # # │ Tell me what you want, what you … ┆ Tell what want, what really r… │
1478
+ # # │ Can you feel the love tonight ┆ Can feel the love tonight │
1479
+ # # └───────────────────────────────────┴───────────────────────────────────┘
1480
+ #
1481
+ # @example
1482
+ # df.with_columns(
1483
+ # Polars.col("lyrics")
1484
+ # .str.replace_many(
1485
+ # ["me", "you"],
1486
+ # ["you", "me"]
1487
+ # )
1488
+ # .alias("confusing")
1489
+ # )
1490
+ # # =>
1491
+ # # shape: (3, 2)
1492
+ # # ┌───────────────────────────────────┬───────────────────────────────────┐
1493
+ # # │ lyrics ┆ confusing │
1494
+ # # │ --- ┆ --- │
1495
+ # # │ str ┆ str │
1496
+ # # ╞═══════════════════════════════════╪═══════════════════════════════════╡
1497
+ # # │ Everybody wants to rule the worl… ┆ Everybody wants to rule the worl… │
1498
+ # # │ Tell me what you want, what you … ┆ Tell you what me want, what me r… │
1499
+ # # │ Can you feel the love tonight ┆ Can me feel the love tonight │
1500
+ # # └───────────────────────────────────┴───────────────────────────────────┘
1501
def replace_many(patterns, replace_with, ascii_case_insensitive: false)
  # Patterns are parsed with str_as_lit disabled, replacements with it
  # enabled; list_as_lit is disabled for both.
  pattern_expr = Utils.parse_as_expression(patterns, str_as_lit: false, list_as_lit: false)
  replacement_expr = Utils.parse_as_expression(replace_with, str_as_lit: true, list_as_lit: false)
  replaced = _rbexpr.str_replace_many(pattern_expr, replacement_expr, ascii_case_insensitive)
  Utils.wrap_expr(replaced)
end
1512
+
1513
+ private
1514
+
1515
def _validate_format_argument(format)
  # TODO: validation is not implemented yet; the argument is accepted
  # unchecked and nil is returned.
  nil
end
1518
+ end
1519
+ end