polars-df 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +8 -0
  4. data/Cargo.lock +2 -1
  5. data/README.md +1 -1
  6. data/ext/polars/Cargo.toml +7 -1
  7. data/ext/polars/src/batched_csv.rs +120 -0
  8. data/ext/polars/src/conversion.rs +139 -6
  9. data/ext/polars/src/dataframe.rs +360 -15
  10. data/ext/polars/src/error.rs +9 -0
  11. data/ext/polars/src/file.rs +8 -7
  12. data/ext/polars/src/lazy/apply.rs +7 -0
  13. data/ext/polars/src/lazy/dataframe.rs +135 -3
  14. data/ext/polars/src/lazy/dsl.rs +97 -2
  15. data/ext/polars/src/lazy/meta.rs +1 -1
  16. data/ext/polars/src/lazy/mod.rs +1 -0
  17. data/ext/polars/src/lib.rs +227 -12
  18. data/ext/polars/src/series.rs +190 -38
  19. data/ext/polars/src/set.rs +91 -0
  20. data/ext/polars/src/utils.rs +19 -0
  21. data/lib/polars/batched_csv_reader.rb +96 -0
  22. data/lib/polars/cat_expr.rb +39 -0
  23. data/lib/polars/data_frame.rb +2813 -100
  24. data/lib/polars/date_time_expr.rb +1282 -7
  25. data/lib/polars/exceptions.rb +20 -0
  26. data/lib/polars/expr.rb +631 -11
  27. data/lib/polars/expr_dispatch.rb +14 -0
  28. data/lib/polars/functions.rb +219 -0
  29. data/lib/polars/group_by.rb +517 -0
  30. data/lib/polars/io.rb +763 -4
  31. data/lib/polars/lazy_frame.rb +1415 -67
  32. data/lib/polars/lazy_functions.rb +430 -9
  33. data/lib/polars/lazy_group_by.rb +79 -0
  34. data/lib/polars/list_expr.rb +5 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2244 -192
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +663 -2
  39. data/lib/polars/struct_expr.rb +73 -0
  40. data/lib/polars/utils.rb +76 -3
  41. data/lib/polars/version.rb +2 -1
  42. data/lib/polars/when.rb +1 -0
  43. data/lib/polars/when_then.rb +1 -0
  44. data/lib/polars.rb +8 -2
  45. metadata +12 -2
@@ -1,7 +1,10 @@
1
1
  module Polars
2
+ # Namespace for string related expressions.
2
3
  class StringExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
@@ -9,26 +12,166 @@ module Polars
9
12
  # def strptime
10
13
  # end
11
14
 
15
+ # Get length of the strings as `:u32` (as number of bytes).
16
+ #
17
+ # @return [Expr]
18
+ #
19
+ # @note
20
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
21
+ # need the length in terms of the number of characters, use `n_chars` instead.
22
+ #
23
+ # @example
24
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
25
+ # [
26
+ # Polars.col("s").str.lengths.alias("length"),
27
+ # Polars.col("s").str.n_chars.alias("nchars")
28
+ # ]
29
+ # )
30
+ # df
31
+ # # =>
32
+ # # shape: (4, 3)
33
+ # # ┌──────┬────────┬────────┐
34
+ # # │ s ┆ length ┆ nchars │
35
+ # # │ --- ┆ --- ┆ --- │
36
+ # # │ str ┆ u32 ┆ u32 │
37
+ # # ╞══════╪════════╪════════╡
38
+ # # │ Café ┆ 5 ┆ 4 │
39
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
40
+ # # │ null ┆ null ┆ null │
41
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
42
+ # # │ 345 ┆ 3 ┆ 3 │
43
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
44
+ # # │ 東京 ┆ 6 ┆ 2 │
45
+ # # └──────┴────────┴────────┘
12
46
  def lengths
13
47
  Utils.wrap_expr(_rbexpr.str_lengths)
14
48
  end
15
49
 
50
+ # Get length of the strings as `:u32` (as number of chars).
51
+ #
52
+ # @return [Expr]
53
+ #
54
+ # @note
55
+ # If you know that you are working with ASCII text, `lengths` will be
56
+ # equivalent, and faster (returns length in terms of the number of bytes).
57
+ #
58
+ # @example
59
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
60
+ # [
61
+ # Polars.col("s").str.lengths.alias("length"),
62
+ # Polars.col("s").str.n_chars.alias("nchars")
63
+ # ]
64
+ # )
65
+ # df
66
+ # # =>
67
+ # # shape: (4, 3)
68
+ # # ┌──────┬────────┬────────┐
69
+ # # │ s ┆ length ┆ nchars │
70
+ # # │ --- ┆ --- ┆ --- │
71
+ # # │ str ┆ u32 ┆ u32 │
72
+ # # ╞══════╪════════╪════════╡
73
+ # # │ Café ┆ 5 ┆ 4 │
74
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
75
+ # # │ null ┆ null ┆ null │
76
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
77
+ # # │ 345 ┆ 3 ┆ 3 │
78
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
79
+ # # │ 東京 ┆ 6 ┆ 2 │
80
+ # # └──────┴────────┴────────┘
16
81
  def n_chars
17
82
  Utils.wrap_expr(_rbexpr.str_n_chars)
18
83
  end
19
84
 
85
+ # Vertically concatenate the values in the Series to a single string value.
86
+ #
87
+ # @param delimiter [String]
88
+ # The delimiter to insert between consecutive string values.
89
+ #
90
+ # @return [Expr]
91
+ #
92
+ # @example
93
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
94
+ # df.select(Polars.col("foo").str.concat("-"))
95
+ # # =>
96
+ # # shape: (1, 1)
97
+ # # ┌──────────┐
98
+ # # │ foo │
99
+ # # │ --- │
100
+ # # │ str │
101
+ # # ╞══════════╡
102
+ # # │ 1-null-2 │
103
+ # # └──────────┘
20
104
  def concat(delimiter = "-")
21
105
  Utils.wrap_expr(_rbexpr.str_concat(delimiter))
22
106
  end
23
107
 
108
+ # Transform to uppercase variant.
109
+ #
110
+ # @return [Expr]
111
+ #
112
+ # @example
113
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
114
+ # df.select(Polars.col("foo").str.to_uppercase)
115
+ # # =>
116
+ # # shape: (2, 1)
117
+ # # ┌─────┐
118
+ # # │ foo │
119
+ # # │ --- │
120
+ # # │ str │
121
+ # # ╞═════╡
122
+ # # │ CAT │
123
+ # # ├╌╌╌╌╌┤
124
+ # # │ DOG │
125
+ # # └─────┘
24
126
  def to_uppercase
25
127
  Utils.wrap_expr(_rbexpr.str_to_uppercase)
26
128
  end
27
129
 
130
+ # Transform to lowercase variant.
131
+ #
132
+ # @return [Expr]
133
+ #
134
+ # @example
135
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
136
+ # df.select(Polars.col("foo").str.to_lowercase)
137
+ # # =>
138
+ # # shape: (2, 1)
139
+ # # ┌─────┐
140
+ # # │ foo │
141
+ # # │ --- │
142
+ # # │ str │
143
+ # # ╞═════╡
144
+ # # │ cat │
145
+ # # ├╌╌╌╌╌┤
146
+ # # │ dog │
147
+ # # └─────┘
28
148
  def to_lowercase
29
149
  Utils.wrap_expr(_rbexpr.str_to_lowercase)
30
150
  end
31
151
 
152
+ # Remove leading and trailing whitespace.
153
+ #
154
+ # @param matches [String, nil]
155
+ # An optional single character that should be trimmed.
156
+ #
157
+ # @return [Expr]
158
+ #
159
+ # @example
160
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
161
+ # df.select(Polars.col("foo").str.strip)
162
+ # # =>
163
+ # # shape: (3, 1)
164
+ # # ┌───────┐
165
+ # # │ foo │
166
+ # # │ --- │
167
+ # # │ str │
168
+ # # ╞═══════╡
169
+ # # │ lead │
170
+ # # ├╌╌╌╌╌╌╌┤
171
+ # # │ trail │
172
+ # # ├╌╌╌╌╌╌╌┤
173
+ # # │ both │
174
+ # # └───────┘
32
175
  def strip(matches = nil)
33
176
  if !matches.nil? && matches.length > 1
34
177
  raise ArgumentError, "matches should contain a single character"
@@ -36,6 +179,29 @@ module Polars
36
179
  Utils.wrap_expr(_rbexpr.str_strip(matches))
37
180
  end
38
181
 
182
+ # Remove leading whitespace.
183
+ #
184
+ # @param matches [String, nil]
185
+ # An optional single character that should be trimmed.
186
+ #
187
+ # @return [Expr]
188
+ #
189
+ # @example
190
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
191
+ # df.select(Polars.col("foo").str.lstrip)
192
+ # # =>
193
+ # # shape: (3, 1)
194
+ # # ┌────────┐
195
+ # # │ foo │
196
+ # # │ --- │
197
+ # # │ str │
198
+ # # ╞════════╡
199
+ # # │ lead │
200
+ # # ├╌╌╌╌╌╌╌╌┤
201
+ # # │ trail │
202
+ # # ├╌╌╌╌╌╌╌╌┤
203
+ # # │ both │
204
+ # # └────────┘
39
205
  def lstrip(matches = nil)
40
206
  if !matches.nil? && matches.length > 1
41
207
  raise ArgumentError, "matches should contain a single character"
@@ -43,6 +209,29 @@ module Polars
43
209
  Utils.wrap_expr(_rbexpr.str_lstrip(matches))
44
210
  end
45
211
 
212
+ # Remove trailing whitespace.
213
+ #
214
+ # @param matches [String, nil]
215
+ # An optional single character that should be trimmed.
216
+ #
217
+ # @return [Expr]
218
+ #
219
+ # @example
220
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
221
+ # df.select(Polars.col("foo").str.rstrip)
222
+ # # =>
223
+ # # shape: (3, 1)
224
+ # # ┌───────┐
225
+ # # │ foo │
226
+ # # │ --- │
227
+ # # │ str │
228
+ # # ╞═══════╡
229
+ # # │ lead │
230
+ # # ├╌╌╌╌╌╌╌┤
231
+ # # │ trail │
232
+ # # ├╌╌╌╌╌╌╌┤
233
+ # # │ both │
234
+ # # └───────┘
46
235
  def rstrip(matches = nil)
47
236
  if !matches.nil? && matches.length > 1
48
237
  raise ArgumentError, "matches should contain a single character"
@@ -50,26 +239,241 @@ module Polars
50
239
  Utils.wrap_expr(_rbexpr.str_rstrip(matches))
51
240
  end
52
241
 
242
+ # Fills the string with zeroes.
243
+ #
244
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
245
+ # of length `alignment`.
246
+ #
247
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
248
+ # sign character rather than before. The original string is returned if `alignment` is
249
+ # less than or equal to `s.length`.
250
+ #
251
+ # @param alignment [Integer]
252
+ # Fill the value up to this length
253
+ #
254
+ # @return [Expr]
255
+ #
256
+ # @example
257
+ # df = Polars::DataFrame.new(
258
+ # {
259
+ # "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
260
+ # }
261
+ # )
262
+ # df.with_column(Polars.col("num").cast(String).str.zfill(5))
263
+ # # =>
264
+ # # shape: (11, 1)
265
+ # # ┌─────────┐
266
+ # # │ num │
267
+ # # │ --- │
268
+ # # │ str │
269
+ # # ╞═════════╡
270
+ # # │ -0010 │
271
+ # # ├╌╌╌╌╌╌╌╌╌┤
272
+ # # │ -0001 │
273
+ # # ├╌╌╌╌╌╌╌╌╌┤
274
+ # # │ 00000 │
275
+ # # ├╌╌╌╌╌╌╌╌╌┤
276
+ # # │ 00001 │
277
+ # # ├╌╌╌╌╌╌╌╌╌┤
278
+ # # │ ... │
279
+ # # ├╌╌╌╌╌╌╌╌╌┤
280
+ # # │ 10000 │
281
+ # # ├╌╌╌╌╌╌╌╌╌┤
282
+ # # │ 100000 │
283
+ # # ├╌╌╌╌╌╌╌╌╌┤
284
+ # # │ 1000000 │
285
+ # # ├╌╌╌╌╌╌╌╌╌┤
286
+ # # │ null │
287
+ # # └─────────┘
53
288
  def zfill(alignment)
54
289
  Utils.wrap_expr(_rbexpr.str_zfill(alignment))
55
290
  end
56
291
 
292
+ # Return the string left justified in a string of length `width`.
293
+ #
294
+ # Padding is done using the specified `fillchar`.
295
+ # The original string is returned if `width` is less than or equal to
296
+ # `s.length`.
297
+ #
298
+ # @param width [Integer]
299
+ # Justify left to this length.
300
+ # @param fillchar [String]
301
+ # Fill with this ASCII character.
302
+ #
303
+ # @return [Expr]
304
+ #
305
+ # @example
306
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
307
+ # df.select(Polars.col("a").str.ljust(8, "*"))
308
+ # # =>
309
+ # # shape: (4, 1)
310
+ # # ┌──────────────┐
311
+ # # │ a │
312
+ # # │ --- │
313
+ # # │ str │
314
+ # # ╞══════════════╡
315
+ # # │ cow***** │
316
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
317
+ # # │ monkey** │
318
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
319
+ # # │ null │
320
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
321
+ # # │ hippopotamus │
322
+ # # └──────────────┘
57
323
  def ljust(width, fillchar = " ")
58
324
  Utils.wrap_expr(_rbexpr.str_ljust(width, fillchar))
59
325
  end
60
326
 
327
+ # Return the string right justified in a string of length `width`.
328
+ #
329
+ # Padding is done using the specified `fillchar`.
330
+ # The original string is returned if `width` is less than or equal to
331
+ # `s.length`.
332
+ #
333
+ # @param width [Integer]
334
+ # Justify right to this length.
335
+ # @param fillchar [String]
336
+ # Fill with this ASCII character.
337
+ #
338
+ # @return [Expr]
339
+ #
340
+ # @example
341
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
342
+ # df.select(Polars.col("a").str.rjust(8, "*"))
343
+ # # =>
344
+ # # shape: (4, 1)
345
+ # # ┌──────────────┐
346
+ # # │ a │
347
+ # # │ --- │
348
+ # # │ str │
349
+ # # ╞══════════════╡
350
+ # # │ *****cow │
351
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
352
+ # # │ **monkey │
353
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
354
+ # # │ null │
355
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
356
+ # # │ hippopotamus │
357
+ # # └──────────────┘
61
358
  def rjust(width, fillchar = " ")
62
359
  Utils.wrap_expr(_rbexpr.str_rjust(width, fillchar))
63
360
  end
64
361
 
362
+ # Check if string contains a substring that matches a regex.
363
+ #
364
+ # @param pattern [String]
365
+ # A valid regex pattern.
366
+ # @param literal [Boolean]
367
+ # Treat pattern as a literal string.
368
+ #
369
+ # @return [Expr]
370
+ #
371
+ # @example
372
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
373
+ # df.select(
374
+ # [
375
+ # Polars.col("a"),
376
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
377
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
378
+ # ]
379
+ # )
380
+ # # =>
381
+ # # shape: (4, 3)
382
+ # # ┌─────────────┬───────┬─────────┐
383
+ # # │ a ┆ regex ┆ literal │
384
+ # # │ --- ┆ --- ┆ --- │
385
+ # # │ str ┆ bool ┆ bool │
386
+ # # ╞═════════════╪═══════╪═════════╡
387
+ # # │ Crab ┆ false ┆ false │
388
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
389
+ # # │ cat and dog ┆ true ┆ false │
390
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
391
+ # # │ rab$bit ┆ true ┆ true │
392
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
393
+ # # │ null ┆ null ┆ null │
394
+ # # └─────────────┴───────┴─────────┘
65
395
  def contains(pattern, literal: false)
66
396
  Utils.wrap_expr(_rbexpr.str_contains(pattern, literal))
67
397
  end
68
398
 
399
+ # Check if string values end with a substring.
400
+ #
401
+ # @param sub [String]
402
+ # Suffix substring.
403
+ #
404
+ # @return [Expr]
405
+ #
406
+ # @example
407
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
408
+ # df.with_column(
409
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
410
+ # )
411
+ # # =>
412
+ # # shape: (3, 2)
413
+ # # ┌────────┬────────────┐
414
+ # # │ fruits ┆ has_suffix │
415
+ # # │ --- ┆ --- │
416
+ # # │ str ┆ bool │
417
+ # # ╞════════╪════════════╡
418
+ # # │ apple ┆ false │
419
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
420
+ # # │ mango ┆ true │
421
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
422
+ # # │ null ┆ null │
423
+ # # └────────┴────────────┘
424
+ #
425
+ # @example Using `ends_with` as a filter condition:
426
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
427
+ # # =>
428
+ # # shape: (1, 1)
429
+ # # ┌────────┐
430
+ # # │ fruits │
431
+ # # │ --- │
432
+ # # │ str │
433
+ # # ╞════════╡
434
+ # # │ mango │
435
+ # # └────────┘
69
436
  def ends_with(sub)
70
437
  Utils.wrap_expr(_rbexpr.str_ends_with(sub))
71
438
  end
72
439
 
440
+ # Check if string values start with a substring.
441
+ #
442
+ # @param sub [String]
443
+ # Prefix substring.
444
+ #
445
+ # @return [Expr]
446
+ #
447
+ # @example
448
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
449
+ # df.with_column(
450
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
451
+ # )
452
+ # # =>
453
+ # # shape: (3, 2)
454
+ # # ┌────────┬────────────┐
455
+ # # │ fruits ┆ has_prefix │
456
+ # # │ --- ┆ --- │
457
+ # # │ str ┆ bool │
458
+ # # ╞════════╪════════════╡
459
+ # # │ apple ┆ true │
460
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
461
+ # # │ mango ┆ false │
462
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
463
+ # # │ null ┆ null │
464
+ # # └────────┴────────────┘
465
+ #
466
+ # @example Using `starts_with` as a filter condition:
467
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
468
+ # # =>
469
+ # # shape: (1, 1)
470
+ # # ┌────────┐
471
+ # # │ fruits │
472
+ # # │ --- │
473
+ # # │ str │
474
+ # # ╞════════╡
475
+ # # │ apple │
476
+ # # └────────┘
73
477
  def starts_with(sub)
74
478
  Utils.wrap_expr(_rbexpr.str_starts_with(sub))
75
479
  end
@@ -83,18 +487,125 @@ module Polars
83
487
  # def encode
84
488
  # end
85
489
 
490
+ # Extract the target capture group from provided patterns.
491
+ #
492
+ # @param pattern [String]
493
+ # A valid regex pattern
494
+ # @param group_index [Integer]
495
+ # Index of the targeted capture group.
496
+ # Group 0 means the whole pattern; the first group begins at index 1.
497
+ # Defaults to the first capture group.
498
+ #
499
+ # @return [Expr]
500
+ #
501
+ # @example
502
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
503
+ # df.select(
504
+ # [
505
+ # Polars.col("foo").str.extract('(\d+)')
506
+ # ]
507
+ # )
508
+ # # =>
509
+ # # shape: (2, 1)
510
+ # # ┌─────┐
511
+ # # │ foo │
512
+ # # │ --- │
513
+ # # │ str │
514
+ # # ╞═════╡
515
+ # # │ 123 │
516
+ # # ├╌╌╌╌╌┤
517
+ # # │ 678 │
518
+ # # └─────┘
86
519
  def extract(pattern, group_index: 1)
87
520
  Utils.wrap_expr(_rbexpr.str_extract(pattern, group_index))
88
521
  end
89
522
 
523
+ # Extracts all matches for the given regex pattern.
524
+ #
525
+ # Extracts each successive non-overlapping regex match in an individual string as
526
+ # an array.
527
+ #
528
+ # @param pattern [String]
529
+ # A valid regex pattern
530
+ #
531
+ # @return [Expr]
532
+ #
533
+ # @example
534
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
535
+ # df.select(
536
+ # [
537
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
538
+ # ]
539
+ # )
540
+ # # =>
541
+ # # shape: (2, 1)
542
+ # # ┌────────────────┐
543
+ # # │ extracted_nrs │
544
+ # # │ --- │
545
+ # # │ list[str] │
546
+ # # ╞════════════════╡
547
+ # # │ ["123", "45"] │
548
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
549
+ # # │ ["678", "910"] │
550
+ # # └────────────────┘
90
551
  def extract_all(pattern)
91
552
  Utils.wrap_expr(_rbexpr.str_extract_all(pattern))
92
553
  end
93
554
 
555
+ # Count all successive non-overlapping regex matches.
556
+ #
557
+ # @param pattern [String]
558
+ # A valid regex pattern
559
+ #
560
+ # @return [Expr]
561
+ #
562
+ # @example
563
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
564
+ # df.select(
565
+ # [
566
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
567
+ # ]
568
+ # )
569
+ # # =>
570
+ # # shape: (2, 1)
571
+ # # ┌──────────────┐
572
+ # # │ count_digits │
573
+ # # │ --- │
574
+ # # │ u32 │
575
+ # # ╞══════════════╡
576
+ # # │ 5 │
577
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
578
+ # # │ 6 │
579
+ # # └──────────────┘
94
580
  def count_match(pattern)
95
581
  Utils.wrap_expr(_rbexpr.count_match(pattern))
96
582
  end
97
583
 
584
+ # Split the string by a substring.
585
+ #
586
+ # @param by [String]
587
+ # Substring to split by.
588
+ # @param inclusive [Boolean]
589
+ # If true, include the split character/string in the results.
590
+ #
591
+ # @return [Expr]
592
+ #
593
+ # @example
594
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
595
+ # df.select(Polars.col("s").str.split(" "))
596
+ # # =>
597
+ # # shape: (3, 1)
598
+ # # ┌───────────────────────┐
599
+ # # │ s │
600
+ # # │ --- │
601
+ # # │ list[str] │
602
+ # # ╞═══════════════════════╡
603
+ # # │ ["foo", "bar"] │
604
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
605
+ # # │ ["foo-bar"] │
606
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
607
+ # # │ ["foo", "bar", "baz"] │
608
+ # # └───────────────────────┘
98
609
  def split(by, inclusive: false)
99
610
  if inclusive
100
611
  Utils.wrap_expr(_rbexpr.str_split_inclusive(by))
@@ -103,6 +614,43 @@ module Polars
103
614
  end
104
615
  end
105
616
 
617
+ # Split the string by a substring using `n` splits.
618
+ #
619
+ # Results in a struct of `n+1` fields.
620
+ #
621
+ # If it cannot make `n` splits, the remaining field elements will be null.
622
+ #
623
+ # @param by [String]
624
+ # Substring to split by.
625
+ # @param n [Integer]
626
+ # Number of splits to make.
627
+ # @param inclusive [Boolean]
628
+ # If true, include the split character/string in the results.
629
+ #
630
+ # @return [Expr]
631
+ #
632
+ # @example
633
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
634
+ # df.select(
635
+ # [
636
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
637
+ # ]
638
+ # )
639
+ # # =>
640
+ # # shape: (4, 1)
641
+ # # ┌─────────────┐
642
+ # # │ fields │
643
+ # # │ --- │
644
+ # # │ struct[2] │
645
+ # # ╞═════════════╡
646
+ # # │ {"a","1"} │
647
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
648
+ # # │ {null,null} │
649
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
650
+ # # │ {"c",null} │
651
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
652
+ # # │ {"d","4"} │
653
+ # # └─────────────┘
106
654
  def split_exact(by, n, inclusive: false)
107
655
  if inclusive
108
656
  Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(by, n))
@@ -111,22 +659,135 @@ module Polars
111
659
  end
112
660
  end
113
661
 
662
+ # Split the string by a substring, restricted to returning at most `n` items.
663
+ #
664
+ # If the number of possible splits is less than `n-1`, the remaining field
665
+ # elements will be null. If the number of possible splits is `n-1` or greater,
666
+ # the last (nth) substring will contain the remainder of the string.
667
+ #
668
+ # @param by [String]
669
+ # Substring to split by.
670
+ # @param n [Integer]
671
+ # Max number of items to return.
672
+ #
673
+ # @return [Expr]
674
+ #
675
+ # @example
676
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
677
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
678
+ # # =>
679
+ # # shape: (4, 1)
680
+ # # ┌───────────────────┐
681
+ # # │ fields │
682
+ # # │ --- │
683
+ # # │ struct[2] │
684
+ # # ╞═══════════════════╡
685
+ # # │ {"foo","bar"} │
686
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
687
+ # # │ {null,null} │
688
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
689
+ # # │ {"foo-bar",null} │
690
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
691
+ # # │ {"foo","bar baz"} │
692
+ # # └───────────────────┘
114
693
  def splitn(by, n)
115
694
  Utils.wrap_expr(_rbexpr.str_splitn(by, n))
116
695
  end
117
696
 
118
- def replace(pattern, literal: false)
697
+ # Replace first matching regex/literal substring with a new string value.
698
+ #
699
+ # @param pattern [String]
700
+ # Regex pattern.
701
+ # @param value [String]
702
+ # Replacement string.
703
+ # @param literal [Boolean]
704
+ # Treat pattern as a literal string.
705
+ #
706
+ # @return [Expr]
707
+ #
708
+ # @example
709
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
710
+ # df.with_column(
711
+ # Polars.col("text").str.replace('abc\b', "ABC")
712
+ # )
713
+ # # =>
714
+ # # shape: (2, 2)
715
+ # # ┌─────┬────────┐
716
+ # # │ id ┆ text │
717
+ # # │ --- ┆ --- │
718
+ # # │ i64 ┆ str │
719
+ # # ╞═════╪════════╡
720
+ # # │ 1 ┆ 123ABC │
721
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
722
+ # # │ 2 ┆ abc456 │
723
+ # # └─────┴────────┘
724
+ def replace(pattern, value, literal: false)
119
725
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
120
726
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
121
727
  Utils.wrap_expr(_rbexpr.str_replace(pattern._rbexpr, value._rbexpr, literal))
122
728
  end
123
729
 
124
- def replace_all(pattern, literal: false)
730
+ # Replace all matching regex/literal substrings with a new string value.
731
+ #
732
+ # @param pattern [String]
733
+ # Regex pattern.
734
+ # @param value [String]
735
+ # Replacement string.
736
+ # @param literal [Boolean]
737
+ # Treat pattern as a literal string.
738
+ #
739
+ # @return [Expr]
740
+ #
741
+ # @example
742
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
743
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
744
+ # # =>
745
+ # # shape: (2, 2)
746
+ # # ┌─────┬─────────┐
747
+ # # │ id ┆ text │
748
+ # # │ --- ┆ --- │
749
+ # # │ i64 ┆ str │
750
+ # # ╞═════╪═════════╡
751
+ # # │ 1 ┆ -bc-bc │
752
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
753
+ # # │ 2 ┆ 123-123 │
754
+ # # └─────┴─────────┘
755
+ def replace_all(pattern, value, literal: false)
125
756
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
126
757
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
127
758
  Utils.wrap_expr(_rbexpr.str_replace_all(pattern._rbexpr, value._rbexpr, literal))
128
759
  end
129
760
 
761
+ # Create subslices of the string values of a Utf8 Series.
762
+ #
763
+ # @param offset [Integer]
764
+ # Start index. Negative indexing is supported.
765
+ # @param length [Integer, nil]
766
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
767
+ # end of the string.
768
+ #
769
+ # @return [Expr]
770
+ #
771
+ # @example
772
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
773
+ # df.with_column(
774
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
775
+ # )
776
+ # # =>
777
+ # # shape: (4, 2)
778
+ # # ┌─────────────┬──────────┐
779
+ # # │ s ┆ s_sliced │
780
+ # # │ --- ┆ --- │
781
+ # # │ str ┆ str │
782
+ # # ╞═════════════╪══════════╡
783
+ # # │ pear ┆ ear │
784
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
785
+ # # │ null ┆ null │
786
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
787
+ # # │ papaya ┆ aya │
788
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
789
+ # # │ dragonfruit ┆ uit │
790
+ # # └─────────────┴──────────┘
130
791
  def slice(offset, length = nil)
131
792
  Utils.wrap_expr(_rbexpr.str_slice(offset, length))
132
793
  end