polars-df 0.13.0-x64-mingw-ucrt

Sign up to get free protection for your applications and to get access to all the features.
Files changed (80) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +208 -0
  4. data/Cargo.lock +2556 -0
  5. data/Cargo.toml +6 -0
  6. data/LICENSE-THIRD-PARTY.txt +39278 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +437 -0
  9. data/lib/polars/3.1/polars.so +0 -0
  10. data/lib/polars/3.2/polars.so +0 -0
  11. data/lib/polars/3.3/polars.so +0 -0
  12. data/lib/polars/array_expr.rb +537 -0
  13. data/lib/polars/array_name_space.rb +423 -0
  14. data/lib/polars/batched_csv_reader.rb +104 -0
  15. data/lib/polars/binary_expr.rb +77 -0
  16. data/lib/polars/binary_name_space.rb +66 -0
  17. data/lib/polars/cat_expr.rb +36 -0
  18. data/lib/polars/cat_name_space.rb +88 -0
  19. data/lib/polars/config.rb +530 -0
  20. data/lib/polars/convert.rb +98 -0
  21. data/lib/polars/data_frame.rb +5191 -0
  22. data/lib/polars/data_types.rb +466 -0
  23. data/lib/polars/date_time_expr.rb +1397 -0
  24. data/lib/polars/date_time_name_space.rb +1287 -0
  25. data/lib/polars/dynamic_group_by.rb +52 -0
  26. data/lib/polars/exceptions.rb +38 -0
  27. data/lib/polars/expr.rb +7256 -0
  28. data/lib/polars/expr_dispatch.rb +22 -0
  29. data/lib/polars/functions/aggregation/horizontal.rb +246 -0
  30. data/lib/polars/functions/aggregation/vertical.rb +282 -0
  31. data/lib/polars/functions/as_datatype.rb +271 -0
  32. data/lib/polars/functions/col.rb +47 -0
  33. data/lib/polars/functions/eager.rb +182 -0
  34. data/lib/polars/functions/lazy.rb +1329 -0
  35. data/lib/polars/functions/len.rb +49 -0
  36. data/lib/polars/functions/lit.rb +35 -0
  37. data/lib/polars/functions/random.rb +16 -0
  38. data/lib/polars/functions/range/date_range.rb +136 -0
  39. data/lib/polars/functions/range/datetime_range.rb +149 -0
  40. data/lib/polars/functions/range/int_range.rb +51 -0
  41. data/lib/polars/functions/range/time_range.rb +141 -0
  42. data/lib/polars/functions/repeat.rb +144 -0
  43. data/lib/polars/functions/whenthen.rb +96 -0
  44. data/lib/polars/functions.rb +57 -0
  45. data/lib/polars/group_by.rb +613 -0
  46. data/lib/polars/io/avro.rb +24 -0
  47. data/lib/polars/io/csv.rb +696 -0
  48. data/lib/polars/io/database.rb +73 -0
  49. data/lib/polars/io/ipc.rb +275 -0
  50. data/lib/polars/io/json.rb +29 -0
  51. data/lib/polars/io/ndjson.rb +80 -0
  52. data/lib/polars/io/parquet.rb +233 -0
  53. data/lib/polars/lazy_frame.rb +2708 -0
  54. data/lib/polars/lazy_group_by.rb +181 -0
  55. data/lib/polars/list_expr.rb +791 -0
  56. data/lib/polars/list_name_space.rb +449 -0
  57. data/lib/polars/meta_expr.rb +222 -0
  58. data/lib/polars/name_expr.rb +198 -0
  59. data/lib/polars/plot.rb +109 -0
  60. data/lib/polars/rolling_group_by.rb +35 -0
  61. data/lib/polars/series.rb +4444 -0
  62. data/lib/polars/slice.rb +104 -0
  63. data/lib/polars/sql_context.rb +194 -0
  64. data/lib/polars/string_cache.rb +75 -0
  65. data/lib/polars/string_expr.rb +1495 -0
  66. data/lib/polars/string_name_space.rb +811 -0
  67. data/lib/polars/struct_expr.rb +98 -0
  68. data/lib/polars/struct_name_space.rb +96 -0
  69. data/lib/polars/testing.rb +507 -0
  70. data/lib/polars/utils/constants.rb +9 -0
  71. data/lib/polars/utils/convert.rb +97 -0
  72. data/lib/polars/utils/parse.rb +89 -0
  73. data/lib/polars/utils/various.rb +76 -0
  74. data/lib/polars/utils/wrap.rb +19 -0
  75. data/lib/polars/utils.rb +130 -0
  76. data/lib/polars/version.rb +4 -0
  77. data/lib/polars/whenthen.rb +83 -0
  78. data/lib/polars-df.rb +1 -0
  79. data/lib/polars.rb +91 -0
  80. metadata +138 -0
@@ -0,0 +1,1495 @@
1
+ module Polars
2
+ # Namespace for string related expressions.
3
+ class StringExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
def initialize(expr)
  # Reuse the underlying expression handle of the expression being wrapped,
  # so every string method here operates on that same handle.
  source_rbexpr = expr._rbexpr
  self._rbexpr = source_rbexpr
end
11
+
12
+ # Convert a Utf8 column into a Date column.
13
+ #
14
+ # @param format [String]
15
+ # Format to use for conversion. Refer to the
16
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
17
+ # for the full specification. Example: `"%Y-%m-%d"`.
18
+ # If set to nil (default), the format is inferred from the data.
19
+ # @param strict [Boolean]
20
+ # Raise an error if any conversion fails.
21
+ # @param exact [Boolean]
22
+ # Require an exact format match. If false, allow the format to match anywhere
23
+ # in the target string.
24
+ # @param cache [Boolean]
25
+ # Use a cache of unique, converted dates to apply the conversion.
26
+ #
27
+ # @return [Expr]
28
+ #
29
+ # @example
30
+ # s = Polars::Series.new(["2020/01/01", "2020/02/01", "2020/03/01"])
31
+ # s.str.to_date
32
+ # # =>
33
+ # # shape: (3,)
34
+ # # Series: '' [date]
35
+ # # [
36
+ # # 2020-01-01
37
+ # # 2020-02-01
38
+ # # 2020-03-01
39
+ # # ]
40
def to_date(format = nil, strict: true, exact: true, cache: true)
  # Check the format argument first, then delegate to `str_to_date` on the
  # underlying expression and wrap the result.
  _validate_format_argument(format)
  date_rbexpr = _rbexpr.str_to_date(format, strict, exact, cache)
  Utils.wrap_expr(date_rbexpr)
end
44
+
45
+ # Convert a Utf8 column into a Datetime column.
46
+ #
47
+ # @param format [String]
48
+ # Format to use for conversion. Refer to the
49
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
50
+ # for the full specification. Example: `"%Y-%m-%d %H:%M:%S"`.
51
+ # If set to nil (default), the format is inferred from the data.
52
+ # @param time_unit ["us", "ns", "ms"]
53
+ # Unit of time for the resulting Datetime column. If set to nil (default),
54
+ # the time unit is inferred from the format string if given, eg:
55
+ # `"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
56
+ # found, the default is `"us"`.
57
+ # @param time_zone [String]
58
+ # Time zone for the resulting Datetime column.
59
+ # @param strict [Boolean]
60
+ # Raise an error if any conversion fails.
61
+ # @param exact [Boolean]
62
+ # Require an exact format match. If false, allow the format to match anywhere
63
+ # in the target string.
64
+ # @param cache [Boolean]
65
+ # Use a cache of unique, converted datetimes to apply the conversion.
66
+ #
67
+ # @return [Expr]
68
+ #
69
+ # @example
70
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
71
+ # s.str.to_datetime("%Y-%m-%d %H:%M%#z")
72
+ # # =>
73
+ # # shape: (2,)
74
+ # # Series: '' [datetime[μs, UTC]]
75
+ # # [
76
+ # # 2020-01-01 01:00:00 UTC
77
+ # # 2020-01-01 02:00:00 UTC
78
+ # # ]
79
def to_datetime(
  format = nil,
  time_unit: nil,
  time_zone: nil,
  strict: true,
  exact: true,
  cache: true,
  ambiguous: "raise"
)
  _validate_format_argument(format)

  # `ambiguous` may already be an expression; anything else is turned into a
  # literal expression so that its `_rbexpr` can be passed along.
  ambiguous_expr = ambiguous.is_a?(Expr) ? ambiguous : Polars.lit(ambiguous)

  datetime_rbexpr = _rbexpr.str_to_datetime(
    format,
    time_unit,
    time_zone,
    strict,
    exact,
    cache,
    ambiguous_expr._rbexpr
  )
  Utils.wrap_expr(datetime_rbexpr)
end
104
+
105
+ # Convert a Utf8 column into a Time column.
106
+ #
107
+ # @param format [String]
108
+ # Format to use for conversion. Refer to the
109
+ # [chrono crate documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
110
+ # for the full specification. Example: `"%H:%M:%S"`.
111
+ # If set to nil (default), the format is inferred from the data.
112
+ # @param strict [Boolean]
113
+ # Raise an error if any conversion fails.
114
+ # @param cache [Boolean]
115
+ # Use a cache of unique, converted times to apply the conversion.
116
+ #
117
+ # @return [Expr]
118
+ #
119
+ # @example
120
+ # s = Polars::Series.new(["01:00", "02:00", "03:00"])
121
+ # s.str.to_time("%H:%M")
122
+ # # =>
123
+ # # shape: (3,)
124
+ # # Series: '' [time]
125
+ # # [
126
+ # # 01:00:00
127
+ # # 02:00:00
128
+ # # 03:00:00
129
+ # # ]
130
def to_time(format = nil, strict: true, cache: true)
  # Check the format argument first, then delegate to `str_to_time` on the
  # underlying expression and wrap the result.
  _validate_format_argument(format)
  time_rbexpr = _rbexpr.str_to_time(format, strict, cache)
  Utils.wrap_expr(time_rbexpr)
end
134
+
135
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
136
+ #
137
+ # @param dtype [Object]
138
+ # The data type to convert into. Can be either Date, Datetime, or Time.
139
+ # @param format [String]
140
+ # Format to use, refer to the
141
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
142
+ # for specification. Example: `"%y-%m-%d"`.
143
+ # @param strict [Boolean]
144
+ # Raise an error if any conversion fails.
145
+ # @param exact [Boolean]
146
+ # - If true, require an exact format match.
147
+ # - If false, allow the format to match anywhere in the target string.
148
+ # @param utc [Boolean]
149
+ # Parse timezone aware datetimes as UTC. This may be useful if you have data
150
+ # with mixed offsets.
151
+ #
152
+ # @return [Expr]
153
+ #
154
+ # @note
155
+ # When parsing a Datetime the column precision will be inferred from
156
+ # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
157
+ # no fractional second component is found then the default is "us".
158
+ #
159
+ # @example Dealing with a consistent format:
160
+ # s = Polars::Series.new(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])
161
+ # s.str.strptime(Polars::Datetime, "%Y-%m-%d %H:%M%#z")
162
+ # # =>
163
+ # # shape: (2,)
164
+ # # Series: '' [datetime[μs, UTC]]
165
+ # # [
166
+ # # 2020-01-01 01:00:00 UTC
167
+ # # 2020-01-01 02:00:00 UTC
168
+ # # ]
169
+ #
170
+ # @example Dealing with different formats.
171
+ # s = Polars::Series.new(
172
+ # "date",
173
+ # [
174
+ # "2021-04-22",
175
+ # "2022-01-04 00:00:00",
176
+ # "01/31/22",
177
+ # "Sun Jul 8 00:34:60 2001",
178
+ # ]
179
+ # )
180
+ # s.to_frame.select(
181
+ # Polars.coalesce(
182
+ # Polars.col("date").str.strptime(Polars::Date, "%F", strict: false),
183
+ # Polars.col("date").str.strptime(Polars::Date, "%F %T", strict: false),
184
+ # Polars.col("date").str.strptime(Polars::Date, "%D", strict: false),
185
+ # Polars.col("date").str.strptime(Polars::Date, "%c", strict: false)
186
+ # )
187
+ # ).to_series
188
+ # # =>
189
+ # # shape: (4,)
190
+ # # Series: 'date' [date]
191
+ # # [
192
+ # # 2021-04-22
193
+ # # 2022-01-04
194
+ # # 2022-01-31
195
+ # # 2001-07-08
196
+ # # ]
197
def strptime(dtype, format = nil, strict: true, exact: true, cache: true, utc: false)
  # NOTE(review): `utc` is accepted but never read in this method body.
  _validate_format_argument(format)

  # Date: forwarded unchanged to `to_date`.
  return to_date(format, strict: strict, exact: exact, cache: cache) if dtype == Date

  # Datetime: accepts either the class itself or an instance; the class is
  # instantiated with its defaults so the time unit / time zone can be read.
  if dtype == Datetime || dtype.is_a?(Datetime)
    resolved = dtype == Datetime ? Datetime.new : dtype
    return to_datetime(
      format,
      time_unit: resolved.time_unit,
      time_zone: resolved.time_zone,
      strict: strict,
      exact: exact,
      cache: cache
    )
  end

  # Time: `exact` is not forwarded because `to_time` has no such option.
  return to_time(format, strict: strict, cache: cache) if dtype == Time

  raise ArgumentError, "dtype should be of type {Date, Datetime, Time}"
end
213
+
214
+ # Convert a String column into a Decimal column.
215
+ #
216
+ # This method infers the needed parameters `precision` and `scale`.
217
+ #
218
+ # @param inference_length [Integer]
219
+ # Number of elements to parse to determine the `precision` and `scale`.
220
+ #
221
+ # @return [Expr]
222
+ #
223
+ # @example
224
+ # df = Polars::DataFrame.new(
225
+ # {
226
+ # "numbers": [
227
+ # "40.12",
228
+ # "3420.13",
229
+ # "120134.19",
230
+ # "3212.98",
231
+ # "12.90",
232
+ # "143.09",
233
+ # "143.9"
234
+ # ]
235
+ # }
236
+ # )
237
+ # df.with_columns(numbers_decimal: Polars.col("numbers").str.to_decimal)
238
+ # # =>
239
+ # # shape: (7, 2)
240
+ # # ┌───────────┬─────────────────┐
241
+ # # │ numbers ┆ numbers_decimal │
242
+ # # │ --- ┆ --- │
243
+ # # │ str ┆ decimal[*,2] │
244
+ # # ╞═══════════╪═════════════════╡
245
+ # # │ 40.12 ┆ 40.12 │
246
+ # # │ 3420.13 ┆ 3420.13 │
247
+ # # │ 120134.19 ┆ 120134.19 │
248
+ # # │ 3212.98 ┆ 3212.98 │
249
+ # # │ 12.90 ┆ 12.90 │
250
+ # # │ 143.09 ┆ 143.09 │
251
+ # # │ 143.9 ┆ 143.90 │
252
+ # # └───────────┴─────────────────┘
253
def to_decimal(inference_length = 100)
  # Delegate to `str_to_decimal` on the underlying expression, passing the
  # inference length straight through, and wrap the result.
  decimal_rbexpr = _rbexpr.str_to_decimal(inference_length)
  Utils.wrap_expr(decimal_rbexpr)
end
256
+
257
+ # Get length of the strings as `:u32` (as number of bytes).
258
+ #
259
+ # @return [Expr]
260
+ #
261
+ # @note
262
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
263
+ # need the length in terms of the number of characters, use `n_chars` instead.
264
+ #
265
+ # @example
266
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
267
+ # [
268
+ # Polars.col("s").str.len_bytes.alias("length"),
269
+ # Polars.col("s").str.len_chars.alias("nchars")
270
+ # ]
271
+ # )
272
+ # df
273
+ # # =>
274
+ # # shape: (4, 3)
275
+ # # ┌──────┬────────┬────────┐
276
+ # # │ s ┆ length ┆ nchars │
277
+ # # │ --- ┆ --- ┆ --- │
278
+ # # │ str ┆ u32 ┆ u32 │
279
+ # # ╞══════╪════════╪════════╡
280
+ # # │ Café ┆ 5 ┆ 4 │
281
+ # # │ null ┆ null ┆ null │
282
+ # # │ 345 ┆ 3 ┆ 3 │
283
+ # # │ 東京 ┆ 6 ┆ 2 │
284
+ # # └──────┴────────┴────────┘
285
def len_bytes
  # Delegate to `str_len_bytes` on the underlying expression and wrap it.
  byte_len_rbexpr = _rbexpr.str_len_bytes
  Utils.wrap_expr(byte_len_rbexpr)
end
288
+ alias_method :lengths, :len_bytes
289
+
290
+ # Get length of the strings as `:u32` (as number of chars).
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @note
295
+ # If you know that you are working with ASCII text, `lengths` will be
296
+ # equivalent, and faster (returns length in terms of the number of bytes).
297
+ #
298
+ # @example
299
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
300
+ # [
301
+ # Polars.col("s").str.len_bytes.alias("length"),
302
+ # Polars.col("s").str.len_chars.alias("nchars")
303
+ # ]
304
+ # )
305
+ # df
306
+ # # =>
307
+ # # shape: (4, 3)
308
+ # # ┌──────┬────────┬────────┐
309
+ # # │ s ┆ length ┆ nchars │
310
+ # # │ --- ┆ --- ┆ --- │
311
+ # # │ str ┆ u32 ┆ u32 │
312
+ # # ╞══════╪════════╪════════╡
313
+ # # │ Café ┆ 5 ┆ 4 │
314
+ # # │ null ┆ null ┆ null │
315
+ # # │ 345 ┆ 3 ┆ 3 │
316
+ # # │ 東京 ┆ 6 ┆ 2 │
317
+ # # └──────┴────────┴────────┘
318
def len_chars
  # Delegate to `str_len_chars` on the underlying expression and wrap it.
  char_len_rbexpr = _rbexpr.str_len_chars
  Utils.wrap_expr(char_len_rbexpr)
end
321
+ alias_method :n_chars, :len_chars
322
+
323
+ # Vertically concat the values in the Series to a single string value.
324
+ #
325
+ # @param delimiter [String]
326
+ # The delimiter to insert between consecutive string values.
327
+ # @param ignore_nulls [Boolean]
328
+ # Ignore null values (default).
329
+ #
330
+ # @return [Expr]
331
+ #
332
+ # @example
333
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
334
+ # df.select(Polars.col("foo").str.join("-"))
335
+ # # =>
336
+ # # shape: (1, 1)
337
+ # # ┌─────┐
338
+ # # │ foo │
339
+ # # │ --- │
340
+ # # │ str │
341
+ # # ╞═════╡
342
+ # # │ 1-2 │
343
+ # # └─────┘
344
+ #
345
+ # @example
346
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
347
+ # df.select(Polars.col("foo").str.join("-", ignore_nulls: false))
348
+ # # =>
349
+ # # shape: (1, 1)
350
+ # # ┌──────┐
351
+ # # │ foo │
352
+ # # │ --- │
353
+ # # │ str │
354
+ # # ╞══════╡
355
+ # # │ null │
356
+ # # └──────┘
357
def join(delimiter = "-", ignore_nulls: true)
  # Delegate to `str_join` on the underlying expression with the delimiter and
  # null-handling flag, then wrap the result.
  joined_rbexpr = _rbexpr.str_join(delimiter, ignore_nulls)
  Utils.wrap_expr(joined_rbexpr)
end
360
+ alias_method :concat, :join
361
+
362
+ # Transform to uppercase variant.
363
+ #
364
+ # @return [Expr]
365
+ #
366
+ # @example
367
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
368
+ # df.select(Polars.col("foo").str.to_uppercase)
369
+ # # =>
370
+ # # shape: (2, 1)
371
+ # # ┌─────┐
372
+ # # │ foo │
373
+ # # │ --- │
374
+ # # │ str │
375
+ # # ╞═════╡
376
+ # # │ CAT │
377
+ # # │ DOG │
378
+ # # └─────┘
379
def to_uppercase
  # Delegate to `str_to_uppercase` on the underlying expression and wrap it.
  upper_rbexpr = _rbexpr.str_to_uppercase
  Utils.wrap_expr(upper_rbexpr)
end
382
+
383
+ # Transform to lowercase variant.
384
+ #
385
+ # @return [Expr]
386
+ #
387
+ # @example
388
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
389
+ # df.select(Polars.col("foo").str.to_lowercase)
390
+ # # =>
391
+ # # shape: (2, 1)
392
+ # # ┌─────┐
393
+ # # │ foo │
394
+ # # │ --- │
395
+ # # │ str │
396
+ # # ╞═════╡
397
+ # # │ cat │
398
+ # # │ dog │
399
+ # # └─────┘
400
def to_lowercase
  # Delegate to `str_to_lowercase` on the underlying expression and wrap it.
  lower_rbexpr = _rbexpr.str_to_lowercase
  Utils.wrap_expr(lower_rbexpr)
end
403
+
404
+ # Transform to titlecase variant.
405
+ #
406
+ # @return [Expr]
407
+ #
408
+ # @example
409
+ # df = Polars::DataFrame.new(
410
+ # {"sing": ["welcome to my world", "THERE'S NO TURNING BACK"]}
411
+ # )
412
+ # df.with_columns(foo_title: Polars.col("sing").str.to_titlecase)
413
+ # # =>
414
+ # # shape: (2, 2)
415
+ # # ┌─────────────────────────┬─────────────────────────┐
416
+ # # │ sing ┆ foo_title │
417
+ # # │ --- ┆ --- │
418
+ # # │ str ┆ str │
419
+ # # ╞═════════════════════════╪═════════════════════════╡
420
+ # # │ welcome to my world ┆ Welcome To My World │
421
+ # # │ THERE'S NO TURNING BACK ┆ There's No Turning Back │
422
+ # # └─────────────────────────┴─────────────────────────┘
423
def to_titlecase
  # Not available yet: this method unconditionally raises `Todo`.
  #
  # The statement that used to follow the raise could never execute, so it
  # has been removed. It is kept here only as a note of the intended
  # delegation:
  #   Utils.wrap_expr(_rbexpr.str_to_titlecase)
  #
  # @raise [Todo] always
  raise Todo
end
427
+
428
+ # Remove leading and trailing whitespace.
429
+ #
430
+ # @param characters [String, nil]
431
+ # An optional single character that should be trimmed.
432
+ #
433
+ # @return [Expr]
434
+ #
435
+ # @example
436
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
437
+ # df.select(Polars.col("foo").str.strip)
438
+ # # =>
439
+ # # shape: (3, 1)
440
+ # # ┌───────┐
441
+ # # │ foo │
442
+ # # │ --- │
443
+ # # │ str │
444
+ # # ╞═══════╡
445
+ # # │ lead │
446
+ # # │ trail │
447
+ # # │ both │
448
+ # # └───────┘
449
def strip_chars(characters = nil)
  # Parse the argument into an expression (strings are treated as literals),
  # then delegate to `str_strip_chars` and wrap the result.
  chars_rbexpr = Utils.parse_into_expression(characters, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_chars(chars_rbexpr)
  Utils.wrap_expr(stripped_rbexpr)
end
453
+ alias_method :strip, :strip_chars
454
+
455
+ # Remove leading whitespace.
456
+ #
457
+ # @param characters [String, nil]
458
+ # An optional single character that should be trimmed.
459
+ #
460
+ # @return [Expr]
461
+ #
462
+ # @example
463
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
464
+ # df.select(Polars.col("foo").str.lstrip)
465
+ # # =>
466
+ # # shape: (3, 1)
467
+ # # ┌────────┐
468
+ # # │ foo │
469
+ # # │ --- │
470
+ # # │ str │
471
+ # # ╞════════╡
472
+ # # │ lead │
473
+ # # │ trail │
474
+ # # │ both │
475
+ # # └────────┘
476
def strip_chars_start(characters = nil)
  # Parse the argument into an expression (strings are treated as literals),
  # then delegate to `str_strip_chars_start` and wrap the result.
  chars_rbexpr = Utils.parse_into_expression(characters, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_chars_start(chars_rbexpr)
  Utils.wrap_expr(stripped_rbexpr)
end
480
+ alias_method :lstrip, :strip_chars_start
481
+
482
+ # Remove trailing whitespace.
483
+ #
484
+ # @param characters [String, nil]
485
+ # An optional single character that should be trimmed.
486
+ #
487
+ # @return [Expr]
488
+ #
489
+ # @example
490
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
491
+ # df.select(Polars.col("foo").str.rstrip)
492
+ # # =>
493
+ # # shape: (3, 1)
494
+ # # ┌───────┐
495
+ # # │ foo │
496
+ # # │ --- │
497
+ # # │ str │
498
+ # # ╞═══════╡
499
+ # # │ lead │
500
+ # # │ trail │
501
+ # # │ both │
502
+ # # └───────┘
503
def strip_chars_end(characters = nil)
  # Parse the argument into an expression (strings are treated as literals),
  # then delegate to `str_strip_chars_end` and wrap the result.
  chars_rbexpr = Utils.parse_into_expression(characters, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_chars_end(chars_rbexpr)
  Utils.wrap_expr(stripped_rbexpr)
end
507
+ alias_method :rstrip, :strip_chars_end
508
+
509
+ # Remove prefix.
510
+ #
511
+ # The prefix will be removed from the string exactly once, if found.
512
+ #
513
+ # @param prefix [String]
514
+ # The prefix to be removed.
515
+ #
516
+ # @return [Expr]
517
+ #
518
+ # @example
519
+ # df = Polars::DataFrame.new({"a" => ["foobar", "foofoobar", "foo", "bar"]})
520
+ # df.with_columns(Polars.col("a").str.strip_prefix("foo").alias("stripped"))
521
+ # # =>
522
+ # # shape: (4, 2)
523
+ # # ┌───────────┬──────────┐
524
+ # # │ a ┆ stripped │
525
+ # # │ --- ┆ --- │
526
+ # # │ str ┆ str │
527
+ # # ╞═══════════╪══════════╡
528
+ # # │ foobar ┆ bar │
529
+ # # │ foofoobar ┆ foobar │
530
+ # # │ foo ┆ │
531
+ # # │ bar ┆ bar │
532
+ # # └───────────┴──────────┘
533
def strip_prefix(prefix)
  # Parse the prefix into an expression (strings are treated as literals),
  # then delegate to `str_strip_prefix` and wrap the result.
  prefix_rbexpr = Utils.parse_into_expression(prefix, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_prefix(prefix_rbexpr)
  Utils.wrap_expr(stripped_rbexpr)
end
537
+
538
+ # Remove suffix.
539
+ #
540
+ # The suffix will be removed from the string exactly once, if found.
541
+ #
542
+ #
543
+ # @param suffix [String]
544
+ # The suffix to be removed.
545
+ #
546
+ # @return [Expr]
547
+ #
548
+ # @example
549
+ # df = Polars::DataFrame.new({"a" => ["foobar", "foobarbar", "foo", "bar"]})
550
+ # df.with_columns(Polars.col("a").str.strip_suffix("bar").alias("stripped"))
551
+ # # =>
552
+ # # shape: (4, 2)
553
+ # # ┌───────────┬──────────┐
554
+ # # │ a ┆ stripped │
555
+ # # │ --- ┆ --- │
556
+ # # │ str ┆ str │
557
+ # # ╞═══════════╪══════════╡
558
+ # # │ foobar ┆ foo │
559
+ # # │ foobarbar ┆ foobar │
560
+ # # │ foo ┆ foo │
561
+ # # │ bar ┆ │
562
+ # # └───────────┴──────────┘
563
def strip_suffix(suffix)
  # Parse the suffix into an expression (strings are treated as literals),
  # then delegate to `str_strip_suffix` and wrap the result.
  suffix_rbexpr = Utils.parse_into_expression(suffix, str_as_lit: true)
  stripped_rbexpr = _rbexpr.str_strip_suffix(suffix_rbexpr)
  Utils.wrap_expr(stripped_rbexpr)
end
567
+
568
+ # Pad the start of the string until it reaches the given length.
569
+ #
570
+ # @param length [Integer]
571
+ # Pad the string until it reaches this length. Strings with length equal to
572
+ # or greater than this value are returned as-is.
573
+ # @param fill_char [String]
574
+ # The character to pad the string with.
575
+ #
576
+ # @return [Expr]
577
+ #
578
+ # @example
579
+ # df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
580
+ # df.with_columns(padded: Polars.col("a").str.pad_start(8, "*"))
581
+ # # =>
582
+ # # shape: (4, 2)
583
+ # # ┌──────────────┬──────────────┐
584
+ # # │ a ┆ padded │
585
+ # # │ --- ┆ --- │
586
+ # # │ str ┆ str │
587
+ # # ╞══════════════╪══════════════╡
588
+ # # │ cow ┆ *****cow │
589
+ # # │ monkey ┆ **monkey │
590
+ # # │ hippopotamus ┆ hippopotamus │
591
+ # # │ null ┆ null │
592
+ # # └──────────────┴──────────────┘
593
def pad_start(length, fill_char = " ")
  # Delegate to `str_pad_start` on the underlying expression with the target
  # length and fill character, then wrap the result.
  padded_rbexpr = _rbexpr.str_pad_start(length, fill_char)
  Utils.wrap_expr(padded_rbexpr)
end
596
+ alias_method :rjust, :pad_start
597
+
598
+ # Pad the end of the string until it reaches the given length.
599
+ #
600
+ # @param length [Integer]
601
+ # Pad the string until it reaches this length. Strings with length equal to
602
+ # or greater than this value are returned as-is.
603
+ # @param fill_char [String]
604
+ # The character to pad the string with.
605
+ #
606
+ # @return [Expr]
607
+ #
608
+ # @example
609
+ # df = Polars::DataFrame.new({"a": ["cow", "monkey", "hippopotamus", nil]})
610
+ # df.with_columns(padded: Polars.col("a").str.pad_end(8, "*"))
611
+ # # =>
612
+ # # shape: (4, 2)
613
+ # # ┌──────────────┬──────────────┐
614
+ # # │ a ┆ padded │
615
+ # # │ --- ┆ --- │
616
+ # # │ str ┆ str │
617
+ # # ╞══════════════╪══════════════╡
618
+ # # │ cow ┆ cow***** │
619
+ # # │ monkey ┆ monkey** │
620
+ # # │ hippopotamus ┆ hippopotamus │
621
+ # # │ null ┆ null │
622
+ # # └──────────────┴──────────────┘
623
def pad_end(length, fill_char = " ")
  # Delegate to `str_pad_end` on the underlying expression with the target
  # length and fill character, then wrap the result.
  padded_rbexpr = _rbexpr.str_pad_end(length, fill_char)
  Utils.wrap_expr(padded_rbexpr)
end
626
+ alias_method :ljust, :pad_end
627
+
628
+ # Fills the string with zeroes.
629
+ #
630
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
631
+ # of the given `length`.
632
+ #
633
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
634
+ # sign character rather than before. The original string is returned if `length` is
634
+ # less than or equal to the length of the string.
636
+ #
637
+ # @param length [Integer]
638
+ # Fill the value up to this length
639
+ #
640
+ # @return [Expr]
641
+ #
642
+ # @example
643
+ # df = Polars::DataFrame.new({"a" => [-1, 123, 999999, nil]})
644
+ # df.with_columns(Polars.col("a").cast(Polars::String).str.zfill(4).alias("zfill"))
645
+ # # =>
646
+ # # shape: (4, 2)
647
+ # # ┌────────┬────────┐
648
+ # # │ a ┆ zfill │
649
+ # # │ --- ┆ --- │
650
+ # # │ i64 ┆ str │
651
+ # # ╞════════╪════════╡
652
+ # # │ -1 ┆ -001 │
653
+ # # │ 123 ┆ 0123 │
654
+ # # │ 999999 ┆ 999999 │
655
+ # # │ null ┆ null │
656
+ # # └────────┴────────┘
657
def zfill(length)
  # Parse the length into an expression, then delegate to `str_zfill` on the
  # underlying expression and wrap the result.
  length_rbexpr = Utils.parse_into_expression(length)
  filled_rbexpr = _rbexpr.str_zfill(length_rbexpr)
  Utils.wrap_expr(filled_rbexpr)
end
661
+
662
+ # Check if string contains a substring that matches a regex.
663
+ #
664
+ # @param pattern [String]
665
+ # A valid regex pattern.
666
+ # @param literal [Boolean]
667
+ # Treat pattern as a literal string.
668
+ #
669
+ # @return [Expr]
670
+ #
671
+ # @example
672
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
673
+ # df.select(
674
+ # [
675
+ # Polars.col("a"),
676
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
677
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
678
+ # ]
679
+ # )
680
+ # # =>
681
+ # # shape: (4, 3)
682
+ # # ┌─────────────┬───────┬─────────┐
683
+ # # │ a ┆ regex ┆ literal │
684
+ # # │ --- ┆ --- ┆ --- │
685
+ # # │ str ┆ bool ┆ bool │
686
+ # # ╞═════════════╪═══════╪═════════╡
687
+ # # │ Crab ┆ false ┆ false │
688
+ # # │ cat and dog ┆ true ┆ false │
689
+ # # │ rab$bit ┆ true ┆ true │
690
+ # # │ null ┆ null ┆ null │
691
+ # # └─────────────┴───────┴─────────┘
692
def contains(pattern, literal: false, strict: true)
  # Parse the pattern into an expression (strings are treated as literals),
  # then delegate to `str_contains`, forwarding the `literal` and `strict`
  # flags, and wrap the result.
  pattern_rbexpr = Utils.parse_into_expression(pattern, str_as_lit: true)
  match_rbexpr = _rbexpr.str_contains(pattern_rbexpr, literal, strict)
  Utils.wrap_expr(match_rbexpr)
end
696
+
697
+ # Check if string values end with a substring.
698
+ #
699
+ # @param sub [String]
700
+ # Suffix substring.
701
+ #
702
+ # @return [Expr]
703
+ #
704
+ # @example
705
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
706
+ # df.with_column(
707
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
708
+ # )
709
+ # # =>
710
+ # # shape: (3, 2)
711
+ # # ┌────────┬────────────┐
712
+ # # │ fruits ┆ has_suffix │
713
+ # # │ --- ┆ --- │
714
+ # # │ str ┆ bool │
715
+ # # ╞════════╪════════════╡
716
+ # # │ apple ┆ false │
717
+ # # │ mango ┆ true │
718
+ # # │ null ┆ null │
719
+ # # └────────┴────────────┘
720
+ #
721
+ # @example Using `ends_with` as a filter condition:
722
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
723
+ # # =>
724
+ # # shape: (1, 1)
725
+ # # ┌────────┐
726
+ # # │ fruits │
727
+ # # │ --- │
728
+ # # │ str │
729
+ # # ╞════════╡
730
+ # # │ mango │
731
+ # # └────────┘
732
def ends_with(sub)
  # Parse the suffix into an expression (strings are treated as literals),
  # then delegate to `str_ends_with` and wrap the result.
  suffix_rbexpr = Utils.parse_into_expression(sub, str_as_lit: true)
  match_rbexpr = _rbexpr.str_ends_with(suffix_rbexpr)
  Utils.wrap_expr(match_rbexpr)
end
736
+
737
+ # Check if string values start with a substring.
738
+ #
739
+ # @param sub [String]
740
+ # Prefix substring.
741
+ #
742
+ # @return [Expr]
743
+ #
744
+ # @example
745
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
746
+ # df.with_column(
747
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
748
+ # )
749
+ # # =>
750
+ # # shape: (3, 2)
751
+ # # ┌────────┬────────────┐
752
+ # # │ fruits ┆ has_prefix │
753
+ # # │ --- ┆ --- │
754
+ # # │ str ┆ bool │
755
+ # # ╞════════╪════════════╡
756
+ # # │ apple ┆ true │
757
+ # # │ mango ┆ false │
758
+ # # │ null ┆ null │
759
+ # # └────────┴────────────┘
760
+ #
761
+ # @example Using `starts_with` as a filter condition:
762
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
763
+ # # =>
764
+ # # shape: (1, 1)
765
+ # # ┌────────┐
766
+ # # │ fruits │
767
+ # # │ --- │
768
+ # # │ str │
769
+ # # ╞════════╡
770
+ # # │ apple │
771
+ # # └────────┘
772
def starts_with(sub)
  # Parse the prefix into an expression (strings are treated as literals),
  # then delegate to `str_starts_with` and wrap the result.
  prefix_rbexpr = Utils.parse_into_expression(sub, str_as_lit: true)
  match_rbexpr = _rbexpr.str_starts_with(prefix_rbexpr)
  Utils.wrap_expr(match_rbexpr)
end
776
+
777
+ # Parse string values as JSON.
778
+ #
779
+ # Raises an error if an invalid JSON string is encountered.
780
+ #
781
+ # @param dtype [Object]
782
+ # The dtype to cast the extracted value to. If nil, the dtype will be
783
+ # inferred from the JSON value.
784
+ #
785
+ # @return [Expr]
786
+ #
787
+ # @example
788
+ # df = Polars::DataFrame.new(
789
+ # {"json" => ['{"a":1, "b": true}', nil, '{"a":2, "b": false}']}
790
+ # )
791
+ # dtype = Polars::Struct.new([Polars::Field.new("a", Polars::Int64), Polars::Field.new("b", Polars::Boolean)])
792
+ # df.select(Polars.col("json").str.json_decode(dtype))
793
+ # # =>
794
+ # # shape: (3, 1)
795
+ # # ┌───────────┐
796
+ # # │ json │
797
+ # # │ --- │
798
+ # # │ struct[2] │
799
+ # # ╞═══════════╡
800
+ # # │ {1,true} │
801
+ # # │ null │
802
+ # # │ {2,false} │
803
+ # # └───────────┘
804
def json_decode(dtype = nil, infer_schema_length: 100)
  # A nil dtype is passed through untouched; any other value is first
  # converted with `Utils.rb_type_to_dtype`.
  target_dtype = dtype.nil? ? nil : Utils.rb_type_to_dtype(dtype)
  decoded_rbexpr = _rbexpr.str_json_decode(target_dtype, infer_schema_length)
  Utils.wrap_expr(decoded_rbexpr)
end
810
+ alias_method :json_extract, :json_decode
811
+
812
+ # Extract the first match of json string with provided JSONPath expression.
813
+ #
814
+ # Raises an error if an invalid JSON string is encountered.
815
+ # All returned values are cast to Utf8 regardless of the original value.
816
+ #
817
+ # Documentation on JSONPath standard can be found
818
+ # [here](https://goessner.net/articles/JsonPath/).
819
+ #
820
+ # @param json_path [String]
821
+ # A valid JSON path query string.
822
+ #
823
+ # @return [Expr]
824
+ #
825
+ # @example
826
+ # df = Polars::DataFrame.new(
827
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
828
+ # )
829
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
830
+ # # =>
831
+ # # shape: (5, 1)
832
+ # # ┌──────────┐
833
+ # # │ json_val │
834
+ # # │ --- │
835
+ # # │ str │
836
+ # # ╞══════════╡
837
+ # # │ 1 │
838
+ # # │ null │
839
+ # # │ 2 │
840
+ # # │ 2.1 │
841
+ # # │ true │
842
+ # # └──────────┘
843
def json_path_match(json_path)
  # Parse the path into an expression (strings are treated as literals),
  # then delegate to `str_json_path_match` and wrap the result.
  path_rbexpr = Utils.parse_into_expression(json_path, str_as_lit: true)
  matched_rbexpr = _rbexpr.str_json_path_match(path_rbexpr)
  Utils.wrap_expr(matched_rbexpr)
end
847
+
848
+ # Decode a value using the provided encoding.
849
+ #
850
+ # @param encoding ["hex", "base64"]
851
+ # The encoding to use.
852
+ # @param strict [Boolean]
853
+ # How to handle invalid inputs:
854
+ #
855
+ # - `true`: An error will be thrown if unable to decode a value.
856
+ # - `false`: Unhandled values will be replaced with `nil`.
857
+ #
858
+ # @return [Expr]
859
+ #
860
+ # @example
861
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
862
+ # df.select(Polars.col("encoded").str.decode("hex"))
863
+ # # =>
864
+ # # shape: (3, 1)
865
+ # # ┌─────────┐
866
+ # # │ encoded │
867
+ # # │ --- │
868
+ # # │ binary │
869
+ # # ╞═════════╡
870
+ # # │ b"foo" │
871
+ # # │ b"bar" │
872
+ # # │ null │
873
+ # # └─────────┘
874
def decode(encoding, strict: true)
  # Dispatch on the requested encoding. Only the exact strings "hex" and
  # "base64" are recognised; `strict` is forwarded to the chosen decoder.
  #
  # @raise [ArgumentError] if `encoding` is neither "hex" nor "base64"
  if encoding == "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  elsif encoding == "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # Single braces: Ruby interpolation has no `{{ }}` escape, so the doubled
    # braces previously appeared verbatim in the message. `inspect` keeps nil
    # and symbol arguments visible in the error text.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
883
+
884
+ # Encode a value using the provided encoding.
885
+ #
886
+ # @param encoding ["hex", "base64"]
887
+ # The encoding to use.
888
+ #
889
+ # @return [Expr]
890
+ #
891
+ # @example
892
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
893
+ # df.select(Polars.col("strings").str.encode("hex"))
894
+ # # =>
895
+ # # shape: (3, 1)
896
+ # # ┌─────────┐
897
+ # # │ strings │
898
+ # # │ --- │
899
+ # # │ str │
900
+ # # ╞═════════╡
901
+ # # │ 666f6f │
902
+ # # │ 626172 │
903
+ # # │ null │
904
+ # # └─────────┘
905
def encode(encoding)
  # Dispatch on the requested encoding. Only the exact strings "hex" and
  # "base64" are recognised.
  #
  # @raise [ArgumentError] if `encoding` is neither "hex" nor "base64"
  if encoding == "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  elsif encoding == "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # Single braces: Ruby interpolation has no `{{ }}` escape, so the doubled
    # braces previously appeared verbatim in the message. `inspect` keeps nil
    # and symbol arguments visible in the error text.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding.inspect}"
  end
end
914
+
915
+ # Extract the target capture group from provided patterns.
916
+ #
917
+ # @param pattern [String]
918
+ # A valid regex pattern
919
+ # @param group_index [Integer]
920
+ # Index of the targeted capture group.
921
+ # Group 0 means the whole pattern; the first capture group begins at index 1.
922
+ # Defaults to the first capture group.
923
+ #
924
+ # @return [Expr]
925
+ #
926
+ # @example
927
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
928
+ # df.select(
929
+ # [
930
+ # Polars.col("foo").str.extract('(\d+)')
931
+ # ]
932
+ # )
933
+ # # =>
934
+ # # shape: (2, 1)
935
+ # # ┌─────┐
936
+ # # │ foo │
937
+ # # │ --- │
938
+ # # │ str │
939
+ # # ╞═════╡
940
+ # # │ 123 │
941
+ # # │ 678 │
942
+ # # └─────┘
943
def extract(pattern, group_index: 1)
  # The pattern is parsed with `str_as_lit: true` before being handed to the
  # native call together with the requested capture-group index.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  extracted = _rbexpr.str_extract(pattern_expr, group_index)
  Utils.wrap_expr(extracted)
end
947
+
948
+ # Extracts all matches for the given regex pattern.
949
+ #
950
+ # Extracts each successive non-overlapping regex match in an individual string as
951
+ # an array.
952
+ #
953
+ # @param pattern [String]
954
+ # A valid regex pattern
955
+ #
956
+ # @return [Expr]
957
+ #
958
+ # @example
959
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
960
+ # df.select(
961
+ # [
962
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
963
+ # ]
964
+ # )
965
+ # # =>
966
+ # # shape: (2, 1)
967
+ # # ┌────────────────┐
968
+ # # │ extracted_nrs │
969
+ # # │ --- │
970
+ # # │ list[str] │
971
+ # # ╞════════════════╡
972
+ # # │ ["123", "45"] │
973
+ # # │ ["678", "910"] │
974
+ # # └────────────────┘
975
def extract_all(pattern)
  # Parse the pattern (with `str_as_lit: true`) inline and wrap the native result.
  Utils.wrap_expr(
    _rbexpr.str_extract_all(
      Utils.parse_into_expression(pattern, str_as_lit: true)
    )
  )
end
979
+
980
+ # Extract all capture groups for the given regex pattern.
981
+ #
982
+ # @param pattern [String]
983
+ # A valid regular expression pattern containing at least one capture group,
984
+ # compatible with the [regex crate](https://docs.rs/regex/latest/regex/).
985
+ #
986
+ # @return [Expr]
987
+ #
988
+ # @example
989
+ # df = Polars::DataFrame.new(
990
+ # {
991
+ # "url": [
992
+ # "http://vote.com/ballon_dor?candidate=messi&ref=python",
993
+ # "http://vote.com/ballon_dor?candidate=weghorst&ref=polars",
994
+ # "http://vote.com/ballon_dor?error=404&ref=rust"
995
+ # ]
996
+ # }
997
+ # )
998
+ # pattern = /candidate=(?<candidate>\w+)&ref=(?<ref>\w+)/.to_s
999
+ # df.select(captures: Polars.col("url").str.extract_groups(pattern)).unnest(
1000
+ # "captures"
1001
+ # )
1002
+ # # =>
1003
+ # # shape: (3, 2)
1004
+ # # ┌───────────┬────────┐
1005
+ # # │ candidate ┆ ref │
1006
+ # # │ --- ┆ --- │
1007
+ # # │ str ┆ str │
1008
+ # # ╞═══════════╪════════╡
1009
+ # # │ messi ┆ python │
1010
+ # # │ weghorst ┆ polars │
1011
+ # # │ null ┆ null │
1012
+ # # └───────────┴────────┘
1013
+ #
1014
+ # @example Unnamed groups have their numerical position converted to a string:
1015
+ # pattern = /candidate=(\w+)&ref=(\w+)/.to_s
1016
+ # (
1017
+ # df.with_columns(
1018
+ # captures: Polars.col("url").str.extract_groups(pattern)
1019
+ # ).with_columns(name: Polars.col("captures").struct["1"].str.to_uppercase)
1020
+ # )
1021
+ # # =>
1022
+ # # shape: (3, 3)
1023
+ # # ┌─────────────────────────────────┬───────────────────────┬──────────┐
1024
+ # # │ url ┆ captures ┆ name │
1025
+ # # │ --- ┆ --- ┆ --- │
1026
+ # # │ str ┆ struct[2] ┆ str │
1027
+ # # ╞═════════════════════════════════╪═══════════════════════╪══════════╡
1028
+ # # │ http://vote.com/ballon_dor?can… ┆ {"messi","python"} ┆ MESSI │
1029
+ # # │ http://vote.com/ballon_dor?can… ┆ {"weghorst","polars"} ┆ WEGHORST │
1030
+ # # │ http://vote.com/ballon_dor?err… ┆ {null,null} ┆ null │
1031
+ # # └─────────────────────────────────┴───────────────────────┴──────────┘
1032
def extract_groups(pattern)
  # Unlike the sibling extract methods, the pattern is passed to the native
  # call as given, without going through Utils.parse_into_expression.
  groups = _rbexpr.str_extract_groups(pattern)
  Utils.wrap_expr(groups)
end
1035
+
1036
+ # Count all successive non-overlapping regex matches.
1037
+ #
1038
+ # @param pattern [String]
1039
+ # A valid regex pattern
1040
+ #
1041
+ # @return [Expr]
1042
+ #
1043
+ # @example
1044
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
1045
+ # df.select(
1046
+ # [
1047
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
1048
+ # ]
1049
+ # )
1050
+ # # =>
1051
+ # # shape: (2, 1)
1052
+ # # ┌──────────────┐
1053
+ # # │ count_digits │
1054
+ # # │ --- │
1055
+ # # │ u32 │
1056
+ # # ╞══════════════╡
1057
+ # # │ 5 │
1058
+ # # │ 6 │
1059
+ # # └──────────────┘
1060
def count_matches(pattern, literal: false)
  # `literal` is forwarded untouched to the native call
  # (presumably "treat pattern as a literal string", as in `replace` - confirm).
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  match_count = _rbexpr.str_count_matches(pattern_expr, literal)
  Utils.wrap_expr(match_count)
end
1064
+ alias_method :count_match, :count_matches
1065
+
1066
+ # Split the string by a substring.
1067
+ #
1068
+ # @param by [String]
1069
+ # Substring to split by.
1070
+ # @param inclusive [Boolean]
1071
+ # If true, include the split character/string in the results.
1072
+ #
1073
+ # @return [Expr]
1074
+ #
1075
+ # @example
1076
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
1077
+ # df.select(Polars.col("s").str.split(" "))
1078
+ # # =>
1079
+ # # shape: (3, 1)
1080
+ # # ┌───────────────────────┐
1081
+ # # │ s │
1082
+ # # │ --- │
1083
+ # # │ list[str] │
1084
+ # # ╞═══════════════════════╡
1085
+ # # │ ["foo", "bar"] │
1086
+ # # │ ["foo-bar"] │
1087
+ # # │ ["foo", "bar", "baz"] │
1088
+ # # └───────────────────────┘
1089
def split(by, inclusive: false)
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  # `inclusive` selects between the two native split variants.
  pieces =
    if inclusive
      _rbexpr.str_split_inclusive(separator)
    else
      _rbexpr.str_split(separator)
    end
  Utils.wrap_expr(pieces)
end
1096
+
1097
+ # Split the string by a substring using `n` splits.
1098
+ #
1099
+ # Results in a struct of `n+1` fields.
1100
+ #
1101
+ # If it cannot make `n` splits, the remaining field elements will be null.
1102
+ #
1103
+ # @param by [String]
1104
+ # Substring to split by.
1105
+ # @param n [Integer]
1106
+ # Number of splits to make.
1107
+ # @param inclusive [Boolean]
1108
+ # If true, include the split character/string in the results.
1109
+ #
1110
+ # @return [Expr]
1111
+ #
1112
+ # @example
1113
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
1114
+ # df.select(
1115
+ # [
1116
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
1117
+ # ]
1118
+ # )
1119
+ # # =>
1120
+ # # shape: (4, 1)
1121
+ # # ┌─────────────┐
1122
+ # # │ fields │
1123
+ # # │ --- │
1124
+ # # │ struct[2] │
1125
+ # # ╞═════════════╡
1126
+ # # │ {"a","1"} │
1127
+ # # │ {null,null} │
1128
+ # # │ {"c",null} │
1129
+ # # │ {"d","4"} │
1130
+ # # └─────────────┘
1131
def split_exact(by, n, inclusive: false)
  separator = Utils.parse_into_expression(by, str_as_lit: true)
  # Guard clause for the inclusive variant; the plain variant is the fall-through.
  return Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(separator, n)) if inclusive

  Utils.wrap_expr(_rbexpr.str_split_exact(separator, n))
end
1139
+
1140
+ # Split the string by a substring, restricted to returning at most `n` items.
1141
+ #
1142
+ # If the number of possible splits is less than `n-1`, the remaining field
1143
+ # elements will be null. If the number of possible splits is `n-1` or greater,
1144
+ # the last (nth) substring will contain the remainder of the string.
1145
+ #
1146
+ # @param by [String]
1147
+ # Substring to split by.
1148
+ # @param n [Integer]
1149
+ # Max number of items to return.
1150
+ #
1151
+ # @return [Expr]
1152
+ #
1153
+ # @example
1154
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
1155
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
1156
+ # # =>
1157
+ # # shape: (4, 1)
1158
+ # # ┌───────────────────┐
1159
+ # # │ fields │
1160
+ # # │ --- │
1161
+ # # │ struct[2] │
1162
+ # # ╞═══════════════════╡
1163
+ # # │ {"foo","bar"} │
1164
+ # # │ {null,null} │
1165
+ # # │ {"foo-bar",null} │
1166
+ # # │ {"foo","bar baz"} │
1167
+ # # └───────────────────┘
1168
def splitn(by, n)
  # Parse the separator (with `str_as_lit: true`) and pass `n` through as given.
  Utils.wrap_expr(
    _rbexpr.str_splitn(Utils.parse_into_expression(by, str_as_lit: true), n)
  )
end
1172
+
1173
+ # Replace first matching regex/literal substring with a new string value.
1174
+ #
1175
+ # @param pattern [String]
1176
+ # Regex pattern.
1177
+ # @param value [String]
1178
+ # Replacement string.
1179
+ # @param literal [Boolean]
1180
+ # Treat pattern as a literal string.
1181
+ #
1182
+ # @return [Expr]
1183
+ #
1184
+ # @example
1185
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
1186
+ # df.with_column(
1187
+ # Polars.col("text").str.replace('abc\b', "ABC")
1188
+ # )
1189
+ # # =>
1190
+ # # shape: (2, 2)
1191
+ # # ┌─────┬────────┐
1192
+ # # │ id ┆ text │
1193
+ # # │ --- ┆ --- │
1194
+ # # │ i64 ┆ str │
1195
+ # # ╞═════╪════════╡
1196
+ # # │ 1 ┆ 123ABC │
1197
+ # # │ 2 ┆ abc456 │
1198
+ # # └─────┴────────┘
1199
def replace(pattern, value, literal: false, n: 1)
  # Both the pattern and the replacement are parsed with `str_as_lit: true`.
  pattern_expr = Utils.parse_into_expression(pattern, str_as_lit: true)
  value_expr = Utils.parse_into_expression(value, str_as_lit: true)
  # `n` is forwarded to the native `str_replace_n` call
  # (presumably the number of matches to replace - confirm).
  replaced = _rbexpr.str_replace_n(pattern_expr, value_expr, literal, n)
  Utils.wrap_expr(replaced)
end
1204
+
1205
+ # Replace all matching regex/literal substrings with a new string value.
1206
+ #
1207
+ # @param pattern [String]
1208
+ # Regex pattern.
1209
+ # @param value [String]
1210
+ # Replacement string.
1211
+ # @param literal [Boolean]
1212
+ # Treat pattern as a literal string.
1213
+ #
1214
+ # @return [Expr]
1215
+ #
1216
+ # @example
1217
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
1218
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
1219
+ # # =>
1220
+ # # shape: (2, 2)
1221
+ # # ┌─────┬─────────┐
1222
+ # # │ id ┆ text │
1223
+ # # │ --- ┆ --- │
1224
+ # # │ i64 ┆ str │
1225
+ # # ╞═════╪═════════╡
1226
+ # # │ 1 ┆ -bc-bc │
1227
+ # # │ 2 ┆ 123-123 │
1228
+ # # └─────┴─────────┘
1229
def replace_all(pattern, value, literal: false)
  # Parse pattern and replacement the same way (`str_as_lit: true`), in order.
  pattern_expr, value_expr = [pattern, value].map do |arg|
    Utils.parse_into_expression(arg, str_as_lit: true)
  end
  Utils.wrap_expr(_rbexpr.str_replace_all(pattern_expr, value_expr, literal))
end
1234
+
1235
+ # Returns string values in reversed order.
1236
+ #
1237
+ # @return [Expr]
1238
+ #
1239
+ # @example
1240
+ # df = Polars::DataFrame.new({"text" => ["foo", "bar", "man\u0303ana"]})
1241
+ # df.with_columns(Polars.col("text").str.reverse.alias("reversed"))
1242
+ # # =>
1243
+ # # shape: (3, 2)
1244
+ # # ┌────────┬──────────┐
1245
+ # # │ text ┆ reversed │
1246
+ # # │ --- ┆ --- │
1247
+ # # │ str ┆ str │
1248
+ # # ╞════════╪══════════╡
1249
+ # # │ foo ┆ oof │
1250
+ # # │ bar ┆ rab │
1251
+ # # │ mañana ┆ anañam │
1252
+ # # └────────┴──────────┘
1253
def reverse
  # Thin wrapper: delegate to the native call and wrap its result.
  reversed = _rbexpr.str_reverse
  Utils.wrap_expr(reversed)
end
1256
+
1257
+ # Create subslices of the string values of a Utf8 Series.
1258
+ #
1259
+ # @param offset [Integer]
1260
+ # Start index. Negative indexing is supported.
1261
+ # @param length [Integer]
1262
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
1263
+ # end of the string.
1264
+ #
1265
+ # @return [Expr]
1266
+ #
1267
+ # @example
1268
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
1269
+ # df.with_column(
1270
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
1271
+ # )
1272
+ # # =>
1273
+ # # shape: (4, 2)
1274
+ # # ┌─────────────┬──────────┐
1275
+ # # │ s ┆ s_sliced │
1276
+ # # │ --- ┆ --- │
1277
+ # # │ str ┆ str │
1278
+ # # ╞═════════════╪══════════╡
1279
+ # # │ pear ┆ ear │
1280
+ # # │ null ┆ null │
1281
+ # # │ papaya ┆ aya │
1282
+ # # │ dragonfruit ┆ uit │
1283
+ # # └─────────────┴──────────┘
1284
def slice(offset, length = nil)
  # Both bounds go through the expression parser with default options;
  # a nil length is parsed as-is rather than special-cased here.
  offset_expr = Utils.parse_into_expression(offset)
  length_expr = Utils.parse_into_expression(length)
  sliced = _rbexpr.str_slice(offset_expr, length_expr)
  Utils.wrap_expr(sliced)
end
1289
+
1290
+ # Convert a Utf8 column into an Int64 column, parsing each string in the given base.
1291
+ #
1292
+ # @param base [Integer]
1293
+ # Positive integer which is the base of the string we are parsing.
1294
+ # Default: 10.
1295
+ # @param strict [Boolean]
1296
+ # If `true` (the default), raise any parse error or overflow as a ComputeError.
1297
+ # If `false`, silently convert such values to null.
1298
+ #
1299
+ # @return [Expr]
1300
+ #
1301
+ # @example
1302
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1303
+ # df.with_columns(Polars.col("bin").str.to_integer(base: 2, strict: false).alias("parsed"))
1304
+ # # =>
1305
+ # # shape: (4, 2)
1306
+ # # ┌─────────┬────────┐
1307
+ # # │ bin ┆ parsed │
1308
+ # # │ --- ┆ --- │
1309
+ # # │ str ┆ i64 │
1310
+ # # ╞═════════╪════════╡
1311
+ # # │ 110 ┆ 6 │
1312
+ # # │ 101 ┆ 5 │
1313
+ # # │ 010 ┆ 2 │
1314
+ # # │ invalid ┆ null │
1315
+ # # └─────────┴────────┘
1316
+ #
1317
+ # @example
1318
+ # df = Polars::DataFrame.new({"hex" => ["fa1e", "ff00", "cafe", nil]})
1319
+ # df.with_columns(Polars.col("hex").str.to_integer(base: 16, strict: true).alias("parsed"))
1320
+ # # =>
1321
+ # # shape: (4, 2)
1322
+ # # ┌──────┬────────┐
1323
+ # # │ hex ┆ parsed │
1324
+ # # │ --- ┆ --- │
1325
+ # # │ str ┆ i64 │
1326
+ # # ╞══════╪════════╡
1327
+ # # │ fa1e ┆ 64030 │
1328
+ # # │ ff00 ┆ 65280 │
1329
+ # # │ cafe ┆ 51966 │
1330
+ # # │ null ┆ null │
1331
+ # # └──────┴────────┘
1332
def to_integer(base: 10, strict: true)
  # The base is parsed with `str_as_lit: false`; `strict` is forwarded
  # unchanged to the native call.
  base_expr = Utils.parse_into_expression(base, str_as_lit: false)
  parsed = _rbexpr.str_to_integer(base_expr, strict)
  Utils.wrap_expr(parsed)
end
1336
+
1337
+ # Parse integers with base radix from strings.
1338
+ #
1339
+ # Uses base 2 by default. With `strict: false`, parse errors and overflows become null.
1340
+ #
1341
+ # @param radix [Integer]
1342
+ # Positive integer which is the base of the string we are parsing.
1343
+ # Default: 2.
1344
+ # @param strict [Boolean]
1345
+ # If `true` (the default), raise any parse error or overflow as a ComputeError.
1346
+ # If `false`, silently convert such values to null.
1347
+ #
1348
+ # @return [Expr]
1349
+ #
1350
+ # @example
1351
+ # df = Polars::DataFrame.new({"bin" => ["110", "101", "010", "invalid"]})
1352
+ # df.select(Polars.col("bin").str.parse_int(2, strict: false))
1353
+ # # =>
1354
+ # # shape: (4, 1)
1355
+ # # ┌──────┐
1356
+ # # │ bin │
1357
+ # # │ --- │
1358
+ # # │ i32 │
1359
+ # # ╞══════╡
1360
+ # # │ 6 │
1361
+ # # │ 5 │
1362
+ # # │ 2 │
1363
+ # # │ null │
1364
+ # # └──────┘
1365
def parse_int(radix = 2, strict: true)
  # Honor the caller's radix: the previous hard-coded `base: 2` silently parsed
  # every input as binary regardless of the argument.
  # The result of `to_integer` is then cast to Int32 with the same `strict` flag.
  to_integer(base: radix, strict: strict).cast(Int32, strict: strict)
end
1368
+
1369
+ # Use the aho-corasick algorithm to find matches.
1370
+ #
1371
+ # This version determines if any of the patterns find a match.
1372
+ #
1373
+ # @param patterns [String]
1374
+ # String patterns to search.
1375
+ # @param ascii_case_insensitive [Boolean]
1376
+ # Enable ASCII-aware case insensitive matching.
1377
+ # When this option is enabled, searching will be performed without respect
1378
+ # to case for ASCII letters (a-z and A-Z) only.
1379
+ #
1380
+ # @return [Expr]
1381
+ #
1382
+ # @example
1383
+ # df = Polars::DataFrame.new(
1384
+ # {
1385
+ # "lyrics": [
1386
+ # "Everybody wants to rule the world",
1387
+ # "Tell me what you want, what you really really want",
1388
+ # "Can you feel the love tonight"
1389
+ # ]
1390
+ # }
1391
+ # )
1392
+ # df.with_columns(
1393
+ # Polars.col("lyrics").str.contains_any(["you", "me"]).alias("contains_any")
1394
+ # )
1395
+ # # =>
1396
+ # # shape: (3, 2)
1397
+ # # ┌─────────────────────────────────┬──────────────┐
1398
+ # # │ lyrics ┆ contains_any │
1399
+ # # │ --- ┆ --- │
1400
+ # # │ str ┆ bool │
1401
+ # # ╞═════════════════════════════════╪══════════════╡
1402
+ # # │ Everybody wants to rule the wo… ┆ false │
1403
+ # # │ Tell me what you want, what yo… ┆ true │
1404
+ # # │ Can you feel the love tonight ┆ true │
1405
+ # # └─────────────────────────────────┴──────────────┘
1406
def contains_any(patterns, ascii_case_insensitive: false)
  # Patterns are parsed with `str_as_lit: false, list_as_series: true`.
  pattern_expr = Utils.parse_into_expression(
    patterns, str_as_lit: false, list_as_series: true
  )
  found = _rbexpr.str_contains_any(pattern_expr, ascii_case_insensitive)
  Utils.wrap_expr(found)
end
1412
+
1413
+ # Use the aho-corasick algorithm to replace many matches.
1414
+ #
1415
+ # @param patterns [String]
1416
+ # String patterns to search and replace.
1417
+ # @param replace_with [String]
1418
+ # Strings to replace where a pattern was a match.
1419
+ # This can be broadcasted. So it supports many:one and many:many.
1420
+ # @param ascii_case_insensitive [Boolean]
1421
+ # Enable ASCII-aware case insensitive matching.
1422
+ # When this option is enabled, searching will be performed without respect
1423
+ # to case for ASCII letters (a-z and A-Z) only.
1424
+ #
1425
+ # @return [Expr]
1426
+ #
1427
+ # @example
1428
+ # df = Polars::DataFrame.new(
1429
+ # {
1430
+ # "lyrics": [
1431
+ # "Everybody wants to rule the world",
1432
+ # "Tell me what you want, what you really really want",
1433
+ # "Can you feel the love tonight"
1434
+ # ]
1435
+ # }
1436
+ # )
1437
+ # df.with_columns(
1438
+ # Polars.col("lyrics")
1439
+ # .str.replace_many(
1440
+ # ["me", "you", "they"],
1441
+ # ""
1442
+ # )
1443
+ # .alias("removes_pronouns")
1444
+ # )
1445
+ # # =>
1446
+ # # shape: (3, 2)
1447
+ # # ┌─────────────────────────────────┬─────────────────────────────────┐
1448
+ # # │ lyrics ┆ removes_pronouns │
1449
+ # # │ --- ┆ --- │
1450
+ # # │ str ┆ str │
1451
+ # # ╞═════════════════════════════════╪═════════════════════════════════╡
1452
+ # # │ Everybody wants to rule the wo… ┆ Everybody wants to rule the wo… │
1453
+ # # │ Tell me what you want, what yo… ┆ Tell what want, what really… │
1454
+ # # │ Can you feel the love tonight ┆ Can feel the love tonight │
1455
+ # # └─────────────────────────────────┴─────────────────────────────────┘
1456
+ #
1457
+ # @example
1458
+ # df.with_columns(
1459
+ # Polars.col("lyrics")
1460
+ # .str.replace_many(
1461
+ # ["me", "you"],
1462
+ # ["you", "me"]
1463
+ # )
1464
+ # .alias("confusing")
1465
+ # )
1466
+ # # =>
1467
+ # # shape: (3, 2)
1468
+ # # ┌─────────────────────────────────┬─────────────────────────────────┐
1469
+ # # │ lyrics ┆ confusing │
1470
+ # # │ --- ┆ --- │
1471
+ # # │ str ┆ str │
1472
+ # # ╞═════════════════════════════════╪═════════════════════════════════╡
1473
+ # # │ Everybody wants to rule the wo… ┆ Everybody wants to rule the wo… │
1474
+ # # │ Tell me what you want, what yo… ┆ Tell you what me want, what me… │
1475
+ # # │ Can you feel the love tonight ┆ Can me feel the love tonight │
1476
+ # # └─────────────────────────────────┴─────────────────────────────────┘
1477
def replace_many(patterns, replace_with, ascii_case_insensitive: false)
  # Patterns and replacements are parsed with different options:
  # `str_as_lit: false` for patterns, `str_as_lit: true` for replacements
  # (both with `list_as_series: true`).
  pattern_expr = Utils.parse_into_expression(patterns, str_as_lit: false, list_as_series: true)
  replacement_expr = Utils.parse_into_expression(replace_with, str_as_lit: true, list_as_series: true)
  replaced = _rbexpr.str_replace_many(pattern_expr, replacement_expr, ascii_case_insensitive)
  Utils.wrap_expr(replaced)
end
1488
+
1489
+ private
1490
+
1491
def _validate_format_argument(format)
  # TODO: validation is not implemented yet; `format` is ignored and nil is returned.
  nil
end
1494
+ end
1495
+ end