polars-df 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Namespace for string related expressions.
2
3
  class StringExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
@@ -9,26 +12,166 @@ module Polars
9
12
  # def strptime
10
13
  # end
11
14
 
15
+ # Get length of the strings as `:u32` (as number of bytes).
16
+ #
17
+ # @return [Expr]
18
+ #
19
+ # @note
20
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
21
+ # need the length in terms of the number of characters, use `n_chars` instead.
22
+ #
23
+ # @example
24
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
25
+ # [
26
+ # Polars.col("s").str.lengths.alias("length"),
27
+ # Polars.col("s").str.n_chars.alias("nchars")
28
+ # ]
29
+ # )
30
+ # df
31
+ # # =>
32
+ # # shape: (4, 3)
33
+ # # ┌──────┬────────┬────────┐
34
+ # # │ s ┆ length ┆ nchars │
35
+ # # │ --- ┆ --- ┆ --- │
36
+ # # │ str ┆ u32 ┆ u32 │
37
+ # # ╞══════╪════════╪════════╡
38
+ # # │ Café ┆ 5 ┆ 4 │
39
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
40
+ # # │ null ┆ null ┆ null │
41
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
42
+ # # │ 345 ┆ 3 ┆ 3 │
43
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
44
+ # # │ 東京 ┆ 6 ┆ 2 │
45
+ # # └──────┴────────┴────────┘
12
46
  def lengths
13
47
  Utils.wrap_expr(_rbexpr.str_lengths)
14
48
  end
15
49
 
50
+ # Get length of the strings as `:u32` (as number of chars).
51
+ #
52
+ # @return [Expr]
53
+ #
54
+ # @note
55
+ # If you know that you are working with ASCII text, `lengths` will be
56
+ # equivalent, and faster (returns length in terms of the number of bytes).
57
+ #
58
+ # @example
59
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
60
+ # [
61
+ # Polars.col("s").str.lengths.alias("length"),
62
+ # Polars.col("s").str.n_chars.alias("nchars")
63
+ # ]
64
+ # )
65
+ # df
66
+ # # =>
67
+ # # shape: (4, 3)
68
+ # # ┌──────┬────────┬────────┐
69
+ # # │ s ┆ length ┆ nchars │
70
+ # # │ --- ┆ --- ┆ --- │
71
+ # # │ str ┆ u32 ┆ u32 │
72
+ # # ╞══════╪════════╪════════╡
73
+ # # │ Café ┆ 5 ┆ 4 │
74
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
75
+ # # │ null ┆ null ┆ null │
76
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
77
+ # # │ 345 ┆ 3 ┆ 3 │
78
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
79
+ # # │ 東京 ┆ 6 ┆ 2 │
80
+ # # └──────┴────────┴────────┘
16
81
  def n_chars
17
82
  Utils.wrap_expr(_rbexpr.str_n_chars)
18
83
  end
19
84
 
85
+ # Vertically concat the values in the Series to a single string value.
86
+ #
87
+ # @param delimiter [String]
88
+ # The delimiter to insert between consecutive string values.
89
+ #
90
+ # @return [Expr]
91
+ #
92
+ # @example
93
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
94
+ # df.select(Polars.col("foo").str.concat("-"))
95
+ # # =>
96
+ # # shape: (1, 1)
97
+ # # ┌──────────┐
98
+ # # │ foo │
99
+ # # │ --- │
100
+ # # │ str │
101
+ # # ╞══════════╡
102
+ # # │ 1-null-2 │
103
+ # # └──────────┘
20
104
  def concat(delimiter = "-")
21
105
  Utils.wrap_expr(_rbexpr.str_concat(delimiter))
22
106
  end
23
107
 
108
+ # Transform to uppercase variant.
109
+ #
110
+ # @return [Expr]
111
+ #
112
+ # @example
113
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
114
+ # df.select(Polars.col("foo").str.to_uppercase)
115
+ # # =>
116
+ # # shape: (2, 1)
117
+ # # ┌─────┐
118
+ # # │ foo │
119
+ # # │ --- │
120
+ # # │ str │
121
+ # # ╞═════╡
122
+ # # │ CAT │
123
+ # # ├╌╌╌╌╌┤
124
+ # # │ DOG │
125
+ # # └─────┘
24
126
  def to_uppercase
25
127
  Utils.wrap_expr(_rbexpr.str_to_uppercase)
26
128
  end
27
129
 
130
+ # Transform to lowercase variant.
131
+ #
132
+ # @return [Expr]
133
+ #
134
+ # @example
135
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
136
+ # df.select(Polars.col("foo").str.to_lowercase)
137
+ # # =>
138
+ # # shape: (2, 1)
139
+ # # ┌─────┐
140
+ # # │ foo │
141
+ # # │ --- │
142
+ # # │ str │
143
+ # # ╞═════╡
144
+ # # │ cat │
145
+ # # ├╌╌╌╌╌┤
146
+ # # │ dog │
147
+ # # └─────┘
28
148
  def to_lowercase
29
149
  Utils.wrap_expr(_rbexpr.str_to_lowercase)
30
150
  end
31
151
 
152
+ # Remove leading and trailing whitespace.
153
+ #
154
+ # @param matches [String, nil]
155
+ # An optional single character that should be trimmed.
156
+ #
157
+ # @return [Expr]
158
+ #
159
+ # @example
160
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
161
+ # df.select(Polars.col("foo").str.strip)
162
+ # # =>
163
+ # # shape: (3, 1)
164
+ # # ┌───────┐
165
+ # # │ foo │
166
+ # # │ --- │
167
+ # # │ str │
168
+ # # ╞═══════╡
169
+ # # │ lead │
170
+ # # ├╌╌╌╌╌╌╌┤
171
+ # # │ trail │
172
+ # # ├╌╌╌╌╌╌╌┤
173
+ # # │ both │
174
+ # # └───────┘
32
175
  def strip(matches = nil)
33
176
  if !matches.nil? && matches.length > 1
34
177
  raise ArgumentError, "matches should contain a single character"
@@ -36,6 +179,29 @@ module Polars
36
179
  Utils.wrap_expr(_rbexpr.str_strip(matches))
37
180
  end
38
181
 
182
+ # Remove leading whitespace.
183
+ #
184
+ # @param matches [String, nil]
185
+ # An optional single character that should be trimmed.
186
+ #
187
+ # @return [Expr]
188
+ #
189
+ # @example
190
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
191
+ # df.select(Polars.col("foo").str.lstrip)
192
+ # # =>
193
+ # # shape: (3, 1)
194
+ # # ┌────────┐
195
+ # # │ foo │
196
+ # # │ --- │
197
+ # # │ str │
198
+ # # ╞════════╡
199
+ # # │ lead │
200
+ # # ├╌╌╌╌╌╌╌╌┤
201
+ # # │ trail │
202
+ # # ├╌╌╌╌╌╌╌╌┤
203
+ # # │ both │
204
+ # # └────────┘
39
205
  def lstrip(matches = nil)
40
206
  if !matches.nil? && matches.length > 1
41
207
  raise ArgumentError, "matches should contain a single character"
@@ -43,6 +209,29 @@ module Polars
43
209
  Utils.wrap_expr(_rbexpr.str_lstrip(matches))
44
210
  end
45
211
 
212
+ # Remove trailing whitespace.
213
+ #
214
+ # @param matches [String, nil]
215
+ # An optional single character that should be trimmed.
216
+ #
217
+ # @return [Expr]
218
+ #
219
+ # @example
220
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
221
+ # df.select(Polars.col("foo").str.rstrip)
222
+ # # =>
223
+ # # shape: (3, 1)
224
+ # # ┌───────┐
225
+ # # │ foo │
226
+ # # │ --- │
227
+ # # │ str │
228
+ # # ╞═══════╡
229
+ # # │ lead │
230
+ # # ├╌╌╌╌╌╌╌┤
231
+ # # │ trail │
232
+ # # ├╌╌╌╌╌╌╌┤
233
+ # # │ both │
234
+ # # └───────┘
46
235
  def rstrip(matches = nil)
47
236
  if !matches.nil? && matches.length > 1
48
237
  raise ArgumentError, "matches should contain a single character"
@@ -50,26 +239,241 @@ module Polars
50
239
  Utils.wrap_expr(_rbexpr.str_rstrip(matches))
51
240
  end
52
241
 
242
+ # Fills the string with zeroes.
243
+ #
244
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
245
+ # of length width.
246
+ #
247
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
248
+ # sign character rather than before. The original string is returned if width is
249
+ # less than or equal to `s.length`.
250
+ #
251
+ # @param alignment [Integer]
252
+ # Fill the value up to this length (the "width" referred to in the description above).
253
+ #
254
+ # @return [Expr]
255
+ #
256
+ # @example
257
+ # df = Polars::DataFrame.new(
258
+ # {
259
+ # "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
260
+ # }
261
+ # )
262
+ # df.with_column(Polars.col("num").cast(String).str.zfill(5))
263
+ # # =>
264
+ # # shape: (11, 1)
265
+ # # ┌─────────┐
266
+ # # │ num │
267
+ # # │ --- │
268
+ # # │ str │
269
+ # # ╞═════════╡
270
+ # # │ -0010 │
271
+ # # ├╌╌╌╌╌╌╌╌╌┤
272
+ # # │ -0001 │
273
+ # # ├╌╌╌╌╌╌╌╌╌┤
274
+ # # │ 00000 │
275
+ # # ├╌╌╌╌╌╌╌╌╌┤
276
+ # # │ 00001 │
277
+ # # ├╌╌╌╌╌╌╌╌╌┤
278
+ # # │ ... │
279
+ # # ├╌╌╌╌╌╌╌╌╌┤
280
+ # # │ 10000 │
281
+ # # ├╌╌╌╌╌╌╌╌╌┤
282
+ # # │ 100000 │
283
+ # # ├╌╌╌╌╌╌╌╌╌┤
284
+ # # │ 1000000 │
285
+ # # ├╌╌╌╌╌╌╌╌╌┤
286
+ # # │ null │
287
+ # # └─────────┘
53
288
  def zfill(alignment)
54
289
  Utils.wrap_expr(_rbexpr.str_zfill(alignment))
55
290
  end
56
291
 
292
+ # Return the string left justified in a string of length `width`.
293
+ #
294
+ # Padding is done using the specified `fillchar`.
295
+ # The original string is returned if `width` is less than or equal to
296
+ # `s.length`.
297
+ #
298
+ # @param width [Integer]
299
+ # Justify left to this length.
300
+ # @param fillchar [String]
301
+ # Fill with this ASCII character.
302
+ #
303
+ # @return [Expr]
304
+ #
305
+ # @example
306
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
307
+ # df.select(Polars.col("a").str.ljust(8, "*"))
308
+ # # =>
309
+ # # shape: (4, 1)
310
+ # # ┌──────────────┐
311
+ # # │ a │
312
+ # # │ --- │
313
+ # # │ str │
314
+ # # ╞══════════════╡
315
+ # # │ cow***** │
316
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
317
+ # # │ monkey** │
318
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
319
+ # # │ null │
320
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
321
+ # # │ hippopotamus │
322
+ # # └──────────────┘
57
323
  def ljust(width, fillchar = " ")
58
324
  Utils.wrap_expr(_rbexpr.str_ljust(width, fillchar))
59
325
  end
60
326
 
327
+ # Return the string right justified in a string of length `width`.
328
+ #
329
+ # Padding is done using the specified `fillchar`.
330
+ # The original string is returned if `width` is less than or equal to
331
+ # `s.length`.
332
+ #
333
+ # @param width [Integer]
334
+ # Justify right to this length.
335
+ # @param fillchar [String]
336
+ # Fill with this ASCII character.
337
+ #
338
+ # @return [Expr]
339
+ #
340
+ # @example
341
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
342
+ # df.select(Polars.col("a").str.rjust(8, "*"))
343
+ # # =>
344
+ # # shape: (4, 1)
345
+ # # ┌──────────────┐
346
+ # # │ a │
347
+ # # │ --- │
348
+ # # │ str │
349
+ # # ╞══════════════╡
350
+ # # │ *****cow │
351
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
352
+ # # │ **monkey │
353
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
354
+ # # │ null │
355
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
356
+ # # │ hippopotamus │
357
+ # # └──────────────┘
61
358
  def rjust(width, fillchar = " ")
62
359
  Utils.wrap_expr(_rbexpr.str_rjust(width, fillchar))
63
360
  end
64
361
 
362
+ # Check if string contains a substring that matches a regex.
363
+ #
364
+ # @param pattern [String]
365
+ # A valid regex pattern.
366
+ # @param literal [Boolean]
367
+ # Treat pattern as a literal string.
368
+ #
369
+ # @return [Expr]
370
+ #
371
+ # @example
372
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
373
+ # df.select(
374
+ # [
375
+ # Polars.col("a"),
376
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
377
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
378
+ # ]
379
+ # )
380
+ # # =>
381
+ # # shape: (4, 3)
382
+ # # ┌─────────────┬───────┬─────────┐
383
+ # # │ a ┆ regex ┆ literal │
384
+ # # │ --- ┆ --- ┆ --- │
385
+ # # │ str ┆ bool ┆ bool │
386
+ # # ╞═════════════╪═══════╪═════════╡
387
+ # # │ Crab ┆ false ┆ false │
388
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
389
+ # # │ cat and dog ┆ true ┆ false │
390
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
391
+ # # │ rab$bit ┆ true ┆ true │
392
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
393
+ # # │ null ┆ null ┆ null │
394
+ # # └─────────────┴───────┴─────────┘
65
395
  def contains(pattern, literal: false)
66
396
  Utils.wrap_expr(_rbexpr.str_contains(pattern, literal))
67
397
  end
68
398
 
399
+ # Check if string values end with a substring.
400
+ #
401
+ # @param sub [String]
402
+ # Suffix substring.
403
+ #
404
+ # @return [Expr]
405
+ #
406
+ # @example
407
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
408
+ # df.with_column(
409
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
410
+ # )
411
+ # # =>
412
+ # # shape: (3, 2)
413
+ # # ┌────────┬────────────┐
414
+ # # │ fruits ┆ has_suffix │
415
+ # # │ --- ┆ --- │
416
+ # # │ str ┆ bool │
417
+ # # ╞════════╪════════════╡
418
+ # # │ apple ┆ false │
419
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
420
+ # # │ mango ┆ true │
421
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
422
+ # # │ null ┆ null │
423
+ # # └────────┴────────────┘
424
+ #
425
+ # @example Using `ends_with` as a filter condition:
426
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
427
+ # # =>
428
+ # # shape: (1, 1)
429
+ # # ┌────────┐
430
+ # # │ fruits │
431
+ # # │ --- │
432
+ # # │ str │
433
+ # # ╞════════╡
434
+ # # │ mango │
435
+ # # └────────┘
69
436
  def ends_with(sub)
70
437
  Utils.wrap_expr(_rbexpr.str_ends_with(sub))
71
438
  end
72
439
 
440
+ # Check if string values start with a substring.
441
+ #
442
+ # @param sub [String]
443
+ # Prefix substring.
444
+ #
445
+ # @return [Expr]
446
+ #
447
+ # @example
448
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
449
+ # df.with_column(
450
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
451
+ # )
452
+ # # =>
453
+ # # shape: (3, 2)
454
+ # # ┌────────┬────────────┐
455
+ # # │ fruits ┆ has_prefix │
456
+ # # │ --- ┆ --- │
457
+ # # │ str ┆ bool │
458
+ # # ╞════════╪════════════╡
459
+ # # │ apple ┆ true │
460
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
461
+ # # │ mango ┆ false │
462
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
463
+ # # │ null ┆ null │
464
+ # # └────────┴────────────┘
465
+ #
466
+ # @example Using `starts_with` as a filter condition:
467
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
468
+ # # =>
469
+ # # shape: (1, 1)
470
+ # # ┌────────┐
471
+ # # │ fruits │
472
+ # # │ --- │
473
+ # # │ str │
474
+ # # ╞════════╡
475
+ # # │ apple │
476
+ # # └────────┘
73
477
  def starts_with(sub)
74
478
  Utils.wrap_expr(_rbexpr.str_starts_with(sub))
75
479
  end
@@ -83,18 +487,125 @@ module Polars
83
487
  # def encode
84
488
  # end
85
489
 
490
+ # Extract the target capture group from provided patterns.
491
+ #
492
+ # @param pattern [String]
493
+ # A valid regex pattern
494
+ # @param group_index [Integer]
495
+ # Index of the targeted capture group.
496
+ # Group 0 means the whole pattern; the first capture group begins at index 1.
497
+ # Defaults to the first capture group.
498
+ #
499
+ # @return [Expr]
500
+ #
501
+ # @example
502
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
503
+ # df.select(
504
+ # [
505
+ # Polars.col("foo").str.extract('(\d+)')
506
+ # ]
507
+ # )
508
+ # # =>
509
+ # # shape: (2, 1)
510
+ # # ┌─────┐
511
+ # # │ foo │
512
+ # # │ --- │
513
+ # # │ str │
514
+ # # ╞═════╡
515
+ # # │ 123 │
516
+ # # ├╌╌╌╌╌┤
517
+ # # │ 678 │
518
+ # # └─────┘
86
519
  def extract(pattern, group_index: 1)
87
520
  Utils.wrap_expr(_rbexpr.str_extract(pattern, group_index))
88
521
  end
89
522
 
523
+ # Extracts all matches for the given regex pattern.
524
+ #
525
+ # Extracts each successive non-overlapping regex match in an individual string as
526
+ # an array.
527
+ #
528
+ # @param pattern [String]
529
+ # A valid regex pattern
530
+ #
531
+ # @return [Expr]
532
+ #
533
+ # @example
534
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
535
+ # df.select(
536
+ # [
537
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
538
+ # ]
539
+ # )
540
+ # # =>
541
+ # # shape: (2, 1)
542
+ # # ┌────────────────┐
543
+ # # │ extracted_nrs │
544
+ # # │ --- │
545
+ # # │ list[str] │
546
+ # # ╞════════════════╡
547
+ # # │ ["123", "45"] │
548
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
549
+ # # │ ["678", "910"] │
550
+ # # └────────────────┘
90
551
  def extract_all(pattern)
91
552
  Utils.wrap_expr(_rbexpr.str_extract_all(pattern))
92
553
  end
93
554
 
555
+ # Count all successive non-overlapping regex matches.
556
+ #
557
+ # @param pattern [String]
558
+ # A valid regex pattern
559
+ #
560
+ # @return [Expr]
561
+ #
562
+ # @example
563
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
564
+ # df.select(
565
+ # [
566
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
567
+ # ]
568
+ # )
569
+ # # =>
570
+ # # shape: (2, 1)
571
+ # # ┌──────────────┐
572
+ # # │ count_digits │
573
+ # # │ --- │
574
+ # # │ u32 │
575
+ # # ╞══════════════╡
576
+ # # │ 5 │
577
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
578
+ # # │ 6 │
579
+ # # └──────────────┘
94
580
  def count_match(pattern)
95
581
  Utils.wrap_expr(_rbexpr.count_match(pattern))
96
582
  end
97
583
 
584
+ # Split the string by a substring.
585
+ #
586
+ # @param by [String]
587
+ # Substring to split by.
588
+ # @param inclusive [Boolean]
589
+ # If true, include the split character/string in the results.
590
+ #
591
+ # @return [Expr]
592
+ #
593
+ # @example
594
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
595
+ # df.select(Polars.col("s").str.split(" "))
596
+ # # =>
597
+ # # shape: (3, 1)
598
+ # # ┌───────────────────────┐
599
+ # # │ s │
600
+ # # │ --- │
601
+ # # │ list[str] │
602
+ # # ╞═══════════════════════╡
603
+ # # │ ["foo", "bar"] │
604
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
605
+ # # │ ["foo-bar"] │
606
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
607
+ # # │ ["foo", "bar", "baz"] │
608
+ # # └───────────────────────┘
98
609
  def split(by, inclusive: false)
99
610
  if inclusive
100
611
  Utils.wrap_expr(_rbexpr.str_split_inclusive(by))
@@ -103,6 +614,43 @@ module Polars
103
614
  end
104
615
  end
105
616
 
617
+ # Split the string by a substring using `n` splits.
618
+ #
619
+ # Results in a struct of `n+1` fields.
620
+ #
621
+ # If it cannot make `n` splits, the remaining field elements will be null.
622
+ #
623
+ # @param by [String]
624
+ # Substring to split by.
625
+ # @param n [Integer]
626
+ # Number of splits to make.
627
+ # @param inclusive [Boolean]
628
+ # If true, include the split character/string in the results.
629
+ #
630
+ # @return [Expr]
631
+ #
632
+ # @example
633
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
634
+ # df.select(
635
+ # [
636
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
637
+ # ]
638
+ # )
639
+ # # =>
640
+ # # shape: (4, 1)
641
+ # # ┌─────────────┐
642
+ # # │ fields │
643
+ # # │ --- │
644
+ # # │ struct[2] │
645
+ # # ╞═════════════╡
646
+ # # │ {"a","1"} │
647
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
648
+ # # │ {null,null} │
649
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
650
+ # # │ {"c",null} │
651
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
652
+ # # │ {"d","4"} │
653
+ # # └─────────────┘
106
654
  def split_exact(by, n, inclusive: false)
107
655
  if inclusive
108
656
  Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(by, n))
@@ -111,22 +659,135 @@ module Polars
111
659
  end
112
660
  end
113
661
 
662
+ # Split the string by a substring, restricted to returning at most `n` items.
663
+ #
664
+ # If the number of possible splits is less than `n-1`, the remaining field
665
+ # elements will be null. If the number of possible splits is `n-1` or greater,
666
+ # the last (nth) substring will contain the remainder of the string.
667
+ #
668
+ # @param by [String]
669
+ # Substring to split by.
670
+ # @param n [Integer]
671
+ # Max number of items to return.
672
+ #
673
+ # @return [Expr]
674
+ #
675
+ # @example
676
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
677
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
678
+ # # =>
679
+ # # shape: (4, 1)
680
+ # # ┌───────────────────┐
681
+ # # │ fields │
682
+ # # │ --- │
683
+ # # │ struct[2] │
684
+ # # ╞═══════════════════╡
685
+ # # │ {"foo","bar"} │
686
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
687
+ # # │ {null,null} │
688
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
689
+ # # │ {"foo-bar",null} │
690
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
691
+ # # │ {"foo","bar baz"} │
692
+ # # └───────────────────┘
114
693
  def splitn(by, n)
115
694
  Utils.wrap_expr(_rbexpr.str_splitn(by, n))
116
695
  end
117
696
 
118
- def replace(pattern, literal: false)
697
+ # Replace first matching regex/literal substring with a new string value.
698
+ #
699
+ # @param pattern [String]
700
+ # Regex pattern.
701
+ # @param value [String]
702
+ # Replacement string.
703
+ # @param literal [Boolean]
704
+ # Treat pattern as a literal string.
705
+ #
706
+ # @return [Expr]
707
+ #
708
+ # @example
709
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
710
+ # df.with_column(
711
+ # Polars.col("text").str.replace('abc\b', "ABC")
712
+ # )
713
+ # # =>
714
+ # # shape: (2, 2)
715
+ # # ┌─────┬────────┐
716
+ # # │ id ┆ text │
717
+ # # │ --- ┆ --- │
718
+ # # │ i64 ┆ str │
719
+ # # ╞═════╪════════╡
720
+ # # │ 1 ┆ 123ABC │
721
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
722
+ # # │ 2 ┆ abc456 │
723
+ # # └─────┴────────┘
724
+ def replace(pattern, value, literal: false)
119
725
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
120
726
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
121
727
  Utils.wrap_expr(_rbexpr.str_replace(pattern._rbexpr, value._rbexpr, literal))
122
728
  end
123
729
 
124
- def replace_all(pattern, literal: false)
730
+ # Replace all matching regex/literal substrings with a new string value.
731
+ #
732
+ # @param pattern [String]
733
+ # Regex pattern.
734
+ # @param value [String]
735
+ # Replacement string.
736
+ # @param literal [Boolean]
737
+ # Treat pattern as a literal string.
738
+ #
739
+ # @return [Expr]
740
+ #
741
+ # @example
742
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
743
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
744
+ # # =>
745
+ # # shape: (2, 2)
746
+ # # ┌─────┬─────────┐
747
+ # # │ id ┆ text │
748
+ # # │ --- ┆ --- │
749
+ # # │ i64 ┆ str │
750
+ # # ╞═════╪═════════╡
751
+ # # │ 1 ┆ -bc-bc │
752
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
753
+ # # │ 2 ┆ 123-123 │
754
+ # # └─────┴─────────┘
755
+ def replace_all(pattern, value, literal: false)
125
756
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
126
757
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
127
758
  Utils.wrap_expr(_rbexpr.str_replace_all(pattern._rbexpr, value._rbexpr, literal))
128
759
  end
129
760
 
761
+ # Create subslices of the string values of a Utf8 Series.
762
+ #
763
+ # @param offset [Integer]
764
+ # Start index. Negative indexing is supported.
765
+ # @param length [Integer]
766
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
767
+ # end of the string.
768
+ #
769
+ # @return [Expr]
770
+ #
771
+ # @example
772
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
773
+ # df.with_column(
774
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
775
+ # )
776
+ # # =>
777
+ # # shape: (4, 2)
778
+ # # ┌─────────────┬──────────┐
779
+ # # │ s ┆ s_sliced │
780
+ # # │ --- ┆ --- │
781
+ # # │ str ┆ str │
782
+ # # ╞═════════════╪══════════╡
783
+ # # │ pear ┆ ear │
784
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
785
+ # # │ null ┆ null │
786
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
787
+ # # │ papaya ┆ aya │
788
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
789
+ # # │ dragonfruit ┆ uit │
790
+ # # └─────────────┴──────────┘
130
791
  def slice(offset, length = nil)
131
792
  Utils.wrap_expr(_rbexpr.str_slice(offset, length))
132
793
  end