polars-df 0.2.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38856 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.bundle +0 -0
  10. data/lib/polars/3.1/polars.bundle +0 -0
  11. data/lib/polars/3.2/polars.bundle +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
@@ -0,0 +1,972 @@
1
+ module Polars
2
+ # Namespace for string related expressions.
3
+ class StringExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
def initialize(expr)
  # Share the native expression handle of the wrapped expression.
  native_handle = expr._rbexpr
  self._rbexpr = native_handle
end
11
+
12
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
13
+ #
14
+ # @param datatype [Symbol]
15
+ # `:date`, `:dateime`, or `:time`.
16
+ # @param fmt [String]
17
+ # Format to use, refer to the
18
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
19
+ # for specification. Example: `"%y-%m-%d"`.
20
+ # @param strict [Boolean]
21
+ # Raise an error if any conversion fails.
22
+ # @param exact [Boolean]
23
+ # - If true, require an exact format match.
24
+ # - If false, allow the format to match anywhere in the target string.
25
+ #
26
+ # @return [Expr]
27
+ #
28
+ # @note
29
+ # When parsing a Datetime the column precision will be inferred from
30
+ # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
31
+ # no fractional second component is found then the default is "us".
32
+ #
33
+ # @example
34
+ # s = Polars::Series.new(
35
+ # "date",
36
+ # [
37
+ # "2021-04-22",
38
+ # "2022-01-04 00:00:00",
39
+ # "01/31/22",
40
+ # "Sun Jul 8 00:34:60 2001"
41
+ # ]
42
+ # )
43
+ # s.to_frame.with_column(
44
+ # Polars.col("date")
45
+ # .str.strptime(:date, "%F", strict: false)
46
+ # .fill_null(
47
+ # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
48
+ # )
49
+ # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
50
+ # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
51
+ # )
52
+ # # =>
53
+ # # shape: (4, 1)
54
+ # # ┌────────────┐
55
+ # # │ date │
56
+ # # │ --- │
57
+ # # │ date │
58
+ # # ╞════════════╡
59
+ # # │ 2021-04-22 │
60
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
61
+ # # │ 2022-01-04 │
62
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
63
+ # # │ 2022-01-31 │
64
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
65
+ # # │ 2001-07-08 │
66
+ # # └────────────┘
67
def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false)
  # Dispatches to the native parser matching `datatype`.
  #
  # `cache` is forwarded to every native parser and `tz_aware` only to the
  # datetime parser; their exact semantics live in the native extension.
  #
  # Raises ArgumentError when `datatype` is not a Polars data type at all, or
  # when it is one other than :date, :datetime, or :time.
  unless Utils.is_polars_dtype(datatype)
    raise ArgumentError, "expected a Polars data type, got: #{datatype.inspect}"
  end

  if datatype == :date
    return Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact, cache))
  end

  if datatype == :datetime
    # TODO: once the time unit can be read from the data type, cast the
    # parsed column with `dt.cast_time_unit`. Until then the unit is whatever
    # the native parser produces.
    return Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware))
  end

  if datatype == :time
    return Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact, cache))
  end

  raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
end
89
+
90
+ # Get length of the strings as `:u32` (as number of bytes).
91
+ #
92
+ # @return [Expr]
93
+ #
94
+ # @note
95
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
96
+ # need the length in terms of the number of characters, use `n_chars` instead.
97
+ #
98
+ # @example
99
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
100
+ # [
101
+ # Polars.col("s").str.lengths.alias("length"),
102
+ # Polars.col("s").str.n_chars.alias("nchars")
103
+ # ]
104
+ # )
105
+ # df
106
+ # # =>
107
+ # # shape: (4, 3)
108
+ # # ┌──────┬────────┬────────┐
109
+ # # │ s ┆ length ┆ nchars │
110
+ # # │ --- ┆ --- ┆ --- │
111
+ # # │ str ┆ u32 ┆ u32 │
112
+ # # ╞══════╪════════╪════════╡
113
+ # # │ Café ┆ 5 ┆ 4 │
114
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
115
+ # # │ null ┆ null ┆ null │
116
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
117
+ # # │ 345 ┆ 3 ┆ 3 │
118
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
119
+ # # │ 東京 ┆ 6 ┆ 2 │
120
+ # # └──────┴────────┴────────┘
121
def lengths
  # Per-string length in bytes, wrapped back into an expression.
  native = _rbexpr.str_lengths
  Utils.wrap_expr(native)
end
124
+
125
+ # Get length of the strings as `:u32` (as number of chars).
126
+ #
127
+ # @return [Expr]
128
+ #
129
+ # @note
130
+ # If you know that you are working with ASCII text, `lengths` will be
131
+ # equivalent, and faster (returns length in terms of the number of bytes).
132
+ #
133
+ # @example
134
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
135
+ # [
136
+ # Polars.col("s").str.lengths.alias("length"),
137
+ # Polars.col("s").str.n_chars.alias("nchars")
138
+ # ]
139
+ # )
140
+ # df
141
+ # # =>
142
+ # # shape: (4, 3)
143
+ # # ┌──────┬────────┬────────┐
144
+ # # │ s ┆ length ┆ nchars │
145
+ # # │ --- ┆ --- ┆ --- │
146
+ # # │ str ┆ u32 ┆ u32 │
147
+ # # ╞══════╪════════╪════════╡
148
+ # # │ Café ┆ 5 ┆ 4 │
149
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
150
+ # # │ null ┆ null ┆ null │
151
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
152
+ # # │ 345 ┆ 3 ┆ 3 │
153
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
154
+ # # │ 東京 ┆ 6 ┆ 2 │
155
+ # # └──────┴────────┴────────┘
156
def n_chars
  # Per-string length in characters, wrapped back into an expression.
  native = _rbexpr.str_n_chars
  Utils.wrap_expr(native)
end
159
+
160
+ # Vertically concat the values in the Series to a single string value.
161
+ #
162
+ # @param delimiter [String]
163
+ # The delimiter to insert between consecutive string values.
164
+ #
165
+ # @return [Expr]
166
+ #
167
+ # @example
168
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
169
+ # df.select(Polars.col("foo").str.concat("-"))
170
+ # # =>
171
+ # # shape: (1, 1)
172
+ # # ┌──────────┐
173
+ # # │ foo │
174
+ # # │ --- │
175
+ # # │ str │
176
+ # # ╞══════════╡
177
+ # # │ 1-null-2 │
178
+ # # └──────────┘
179
def concat(delimiter = "-")
  # Joins all values into one string, separated by `delimiter`.
  joined = _rbexpr.str_concat(delimiter)
  Utils.wrap_expr(joined)
end
182
+
183
+ # Transform to uppercase variant.
184
+ #
185
+ # @return [Expr]
186
+ #
187
+ # @example
188
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
189
+ # df.select(Polars.col("foo").str.to_uppercase)
190
+ # # =>
191
+ # # shape: (2, 1)
192
+ # # ┌─────┐
193
+ # # │ foo │
194
+ # # │ --- │
195
+ # # │ str │
196
+ # # ╞═════╡
197
+ # # │ CAT │
198
+ # # ├╌╌╌╌╌┤
199
+ # # │ DOG │
200
+ # # └─────┘
201
def to_uppercase
  # Delegates upper-casing to the native expression.
  upper = _rbexpr.str_to_uppercase
  Utils.wrap_expr(upper)
end
204
+
205
+ # Transform to lowercase variant.
206
+ #
207
+ # @return [Expr]
208
+ #
209
+ # @example
210
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
211
+ # df.select(Polars.col("foo").str.to_lowercase)
212
+ # # =>
213
+ # # shape: (2, 1)
214
+ # # ┌─────┐
215
+ # # │ foo │
216
+ # # │ --- │
217
+ # # │ str │
218
+ # # ╞═════╡
219
+ # # │ cat │
220
+ # # ├╌╌╌╌╌┤
221
+ # # │ dog │
222
+ # # └─────┘
223
def to_lowercase
  # Delegates lower-casing to the native expression.
  lower = _rbexpr.str_to_lowercase
  Utils.wrap_expr(lower)
end
226
+
227
+ # Remove leading and trailing whitespace.
228
+ #
229
+ # @param matches [String, nil]
230
+ # An optional single character that should be trimmed.
231
+ #
232
+ # @return [Expr]
233
+ #
234
+ # @example
235
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
236
+ # df.select(Polars.col("foo").str.strip)
237
+ # # =>
238
+ # # shape: (3, 1)
239
+ # # ┌───────┐
240
+ # # │ foo │
241
+ # # │ --- │
242
+ # # │ str │
243
+ # # ╞═══════╡
244
+ # # │ lead │
245
+ # # ├╌╌╌╌╌╌╌┤
246
+ # # │ trail │
247
+ # # ├╌╌╌╌╌╌╌┤
248
+ # # │ both │
249
+ # # └───────┘
250
def strip(matches = nil)
  # Reject multi-character trim sets before calling into native code.
  too_long = !matches.nil? && matches.length > 1
  raise ArgumentError, "matches should contain a single character" if too_long

  Utils.wrap_expr(_rbexpr.str_strip(matches))
end
256
+
257
+ # Remove leading whitespace.
258
+ #
259
+ # @param matches [String, nil]
260
+ # An optional single character that should be trimmed.
261
+ #
262
+ # @return [Expr]
263
+ #
264
+ # @example
265
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
266
+ # df.select(Polars.col("foo").str.lstrip)
267
+ # # =>
268
+ # # shape: (3, 1)
269
+ # # ┌────────┐
270
+ # # │ foo │
271
+ # # │ --- │
272
+ # # │ str │
273
+ # # ╞════════╡
274
+ # # │ lead │
275
+ # # ├╌╌╌╌╌╌╌╌┤
276
+ # # │ trail │
277
+ # # ├╌╌╌╌╌╌╌╌┤
278
+ # # │ both │
279
+ # # └────────┘
280
def lstrip(matches = nil)
  # Only nil or a string of at most one character is accepted.
  unless matches.nil?
    raise ArgumentError, "matches should contain a single character" if matches.length > 1
  end

  trimmed = _rbexpr.str_lstrip(matches)
  Utils.wrap_expr(trimmed)
end
286
+
287
+ # Remove trailing whitespace.
288
+ #
289
+ # @param matches [String, nil]
290
+ # An optional single character that should be trimmed.
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @example
295
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
296
+ # df.select(Polars.col("foo").str.rstrip)
297
+ # # =>
298
+ # # shape: (3, 1)
299
+ # # ┌───────┐
300
+ # # │ foo │
301
+ # # │ --- │
302
+ # # │ str │
303
+ # # ╞═══════╡
304
+ # # │ lead │
305
+ # # ├╌╌╌╌╌╌╌┤
306
+ # # │ trail │
307
+ # # ├╌╌╌╌╌╌╌┤
308
+ # # │ both │
309
+ # # └───────┘
310
def rstrip(matches = nil)
  # Guard clause: a trim set longer than one character is an argument error.
  if matches.nil?
    # nothing to validate
  elsif matches.length > 1
    raise ArgumentError, "matches should contain a single character"
  end

  Utils.wrap_expr(_rbexpr.str_rstrip(matches))
end
316
+
317
+ # Fills the string with zeroes.
318
+ #
319
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
320
+ # of length `alignment`.
321
+ #
322
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
323
+ # sign character rather than before. The original string is returned if `alignment` is
324
+ # less than or equal to `s.length`.
325
+ #
326
+ # @param alignment [Integer]
327
+ # Fill the value up to this length
328
+ #
329
+ # @return [Expr]
330
+ #
331
+ # @example
332
+ # df = Polars::DataFrame.new(
333
+ # {
334
+ # "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
335
+ # }
336
+ # )
337
+ # df.with_column(Polars.col("num").cast(String).str.zfill(5))
338
+ # # =>
339
+ # # shape: (11, 1)
340
+ # # ┌─────────┐
341
+ # # │ num │
342
+ # # │ --- │
343
+ # # │ str │
344
+ # # ╞═════════╡
345
+ # # │ -0010 │
346
+ # # ├╌╌╌╌╌╌╌╌╌┤
347
+ # # │ -0001 │
348
+ # # ├╌╌╌╌╌╌╌╌╌┤
349
+ # # │ 00000 │
350
+ # # ├╌╌╌╌╌╌╌╌╌┤
351
+ # # │ 00001 │
352
+ # # ├╌╌╌╌╌╌╌╌╌┤
353
+ # # │ ... │
354
+ # # ├╌╌╌╌╌╌╌╌╌┤
355
+ # # │ 10000 │
356
+ # # ├╌╌╌╌╌╌╌╌╌┤
357
+ # # │ 100000 │
358
+ # # ├╌╌╌╌╌╌╌╌╌┤
359
+ # # │ 1000000 │
360
+ # # ├╌╌╌╌╌╌╌╌╌┤
361
+ # # │ null │
362
+ # # └─────────┘
363
def zfill(alignment)
  # Zero-padding up to `alignment` is done by the native expression.
  padded = _rbexpr.str_zfill(alignment)
  Utils.wrap_expr(padded)
end
366
+
367
+ # Return the string left justified in a string of length `width`.
368
+ #
369
+ # Padding is done using the specified `fillchar`.
370
+ # The original string is returned if `width` is less than or equal to
371
+ # `s.length`.
372
+ #
373
+ # @param width [Integer]
374
+ # Justify left to this length.
375
+ # @param fillchar [String]
376
+ # Fill with this ASCII character.
377
+ #
378
+ # @return [Expr]
379
+ #
380
+ # @example
381
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
382
+ # df.select(Polars.col("a").str.ljust(8, "*"))
383
+ # # =>
384
+ # # shape: (4, 1)
385
+ # # ┌──────────────┐
386
+ # # │ a │
387
+ # # │ --- │
388
+ # # │ str │
389
+ # # ╞══════════════╡
390
+ # # │ cow***** │
391
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
392
+ # # │ monkey** │
393
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
394
+ # # │ null │
395
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
396
+ # # │ hippopotamus │
397
+ # # └──────────────┘
398
def ljust(width, fillchar = " ")
  # Left-justify by padding with `fillchar` up to `width`.
  justified = _rbexpr.str_ljust(width, fillchar)
  Utils.wrap_expr(justified)
end
401
+
402
+ # Return the string right justified in a string of length `width`.
403
+ #
404
+ # Padding is done using the specified `fillchar`.
405
+ # The original string is returned if `width` is less than or equal to
406
+ # `s.length`.
407
+ #
408
+ # @param width [Integer]
409
+ # Justify right to this length.
410
+ # @param fillchar [String]
411
+ # Fill with this ASCII character.
412
+ #
413
+ # @return [Expr]
414
+ #
415
+ # @example
416
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
417
+ # df.select(Polars.col("a").str.rjust(8, "*"))
418
+ # # =>
419
+ # # shape: (4, 1)
420
+ # # ┌──────────────┐
421
+ # # │ a │
422
+ # # │ --- │
423
+ # # │ str │
424
+ # # ╞══════════════╡
425
+ # # │ *****cow │
426
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
427
+ # # │ **monkey │
428
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
429
+ # # │ null │
430
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
431
+ # # │ hippopotamus │
432
+ # # └──────────────┘
433
def rjust(width, fillchar = " ")
  # Right-justify by padding with `fillchar` up to `width`.
  justified = _rbexpr.str_rjust(width, fillchar)
  Utils.wrap_expr(justified)
end
436
+
437
+ # Check if string contains a substring that matches a regex.
438
+ #
439
+ # @param pattern [String]
440
+ # A valid regex pattern.
441
+ # @param literal [Boolean]
442
+ # Treat pattern as a literal string.
443
+ #
444
+ # @return [Expr]
445
+ #
446
+ # @example
447
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
448
+ # df.select(
449
+ # [
450
+ # Polars.col("a"),
451
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
452
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
453
+ # ]
454
+ # )
455
+ # # =>
456
+ # # shape: (4, 3)
457
+ # # ┌─────────────┬───────┬─────────┐
458
+ # # │ a ┆ regex ┆ literal │
459
+ # # │ --- ┆ --- ┆ --- │
460
+ # # │ str ┆ bool ┆ bool │
461
+ # # ╞═════════════╪═══════╪═════════╡
462
+ # # │ Crab ┆ false ┆ false │
463
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
464
+ # # │ cat and dog ┆ true ┆ false │
465
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
466
+ # # │ rab$bit ┆ true ┆ true │
467
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
468
+ # # │ null ┆ null ┆ null │
469
+ # # └─────────────┴───────┴─────────┘
470
def contains(pattern, literal: false)
  # `literal` switches the native matcher from regex to plain-substring mode.
  matched = _rbexpr.str_contains(pattern, literal)
  Utils.wrap_expr(matched)
end
473
+
474
+ # Check if string values end with a substring.
475
+ #
476
+ # @param sub [String]
477
+ # Suffix substring.
478
+ #
479
+ # @return [Expr]
480
+ #
481
+ # @example
482
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
483
+ # df.with_column(
484
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
485
+ # )
486
+ # # =>
487
+ # # shape: (3, 2)
488
+ # # ┌────────┬────────────┐
489
+ # # │ fruits ┆ has_suffix │
490
+ # # │ --- ┆ --- │
491
+ # # │ str ┆ bool │
492
+ # # ╞════════╪════════════╡
493
+ # # │ apple ┆ false │
494
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
495
+ # # │ mango ┆ true │
496
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
497
+ # # │ null ┆ null │
498
+ # # └────────┴────────────┘
499
+ #
500
+ # @example Using `ends_with` as a filter condition:
501
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
502
+ # # =>
503
+ # # shape: (1, 1)
504
+ # # ┌────────┐
505
+ # # │ fruits │
506
+ # # │ --- │
507
+ # # │ str │
508
+ # # ╞════════╡
509
+ # # │ mango │
510
+ # # └────────┘
511
def ends_with(sub)
  # Suffix test is evaluated by the native expression.
  suffix_check = _rbexpr.str_ends_with(sub)
  Utils.wrap_expr(suffix_check)
end
514
+
515
+ # Check if string values start with a substring.
516
+ #
517
+ # @param sub [String]
518
+ # Prefix substring.
519
+ #
520
+ # @return [Expr]
521
+ #
522
+ # @example
523
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
524
+ # df.with_column(
525
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
526
+ # )
527
+ # # =>
528
+ # # shape: (3, 2)
529
+ # # ┌────────┬────────────┐
530
+ # # │ fruits ┆ has_prefix │
531
+ # # │ --- ┆ --- │
532
+ # # │ str ┆ bool │
533
+ # # ╞════════╪════════════╡
534
+ # # │ apple ┆ true │
535
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
536
+ # # │ mango ┆ false │
537
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
538
+ # # │ null ┆ null │
539
+ # # └────────┴────────────┘
540
+ #
541
+ # @example Using `starts_with` as a filter condition:
542
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
543
+ # # =>
544
+ # # shape: (1, 1)
545
+ # # ┌────────┐
546
+ # # │ fruits │
547
+ # # │ --- │
548
+ # # │ str │
549
+ # # ╞════════╡
550
+ # # │ apple │
551
+ # # └────────┘
552
def starts_with(sub)
  # Prefix test is evaluated by the native expression.
  prefix_check = _rbexpr.str_starts_with(sub)
  Utils.wrap_expr(prefix_check)
end
555
+
556
+ # Extract the first match of json string with provided JSONPath expression.
557
+ #
558
+ # Raises an error if an invalid JSON string is encountered.
559
+ # All returned values are cast to Utf8 regardless of the original value.
560
+ #
561
+ # Documentation on JSONPath standard can be found
562
+ # [here](https://goessner.net/articles/JsonPath/).
563
+ #
564
+ # @param json_path [String]
565
+ # A valid JSON path query string.
566
+ #
567
+ # @return [Expr]
568
+ #
569
+ # @example
570
+ # df = Polars::DataFrame.new(
571
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
572
+ # )
573
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
574
+ # # =>
575
+ # # shape: (5, 1)
576
+ # # ┌──────────┐
577
+ # # │ json_val │
578
+ # # │ --- │
579
+ # # │ str │
580
+ # # ╞══════════╡
581
+ # # │ 1 │
582
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
583
+ # # │ null │
584
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
585
+ # # │ 2 │
586
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
587
+ # # │ 2.1 │
588
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
589
+ # # │ true │
590
+ # # └──────────┘
591
def json_path_match(json_path)
  # The JSONPath query is evaluated by the native expression.
  extracted = _rbexpr.str_json_path_match(json_path)
  Utils.wrap_expr(extracted)
end
594
+
595
+ # Decode a value using the provided encoding.
596
+ #
597
+ # @param encoding ["hex", "base64"]
598
+ # The encoding to use.
599
+ # @param strict [Boolean]
600
+ # How to handle invalid inputs:
601
+ #
602
+ # - `true`: An error will be thrown if unable to decode a value.
603
+ # - `false`: Unhandled values will be replaced with `nil`.
604
+ #
605
+ # @return [Expr]
606
+ #
607
+ # @example
608
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
609
+ # df.select(Polars.col("encoded").str.decode("hex"))
610
+ # # =>
611
+ # # shape: (3, 1)
612
+ # # ┌─────────┐
613
+ # # │ encoded │
614
+ # # │ --- │
615
+ # # │ str │
616
+ # # ╞═════════╡
617
+ # # │ foo │
618
+ # # ├╌╌╌╌╌╌╌╌╌┤
619
+ # # │ bar │
620
+ # # ├╌╌╌╌╌╌╌╌╌┤
621
+ # # │ null │
622
+ # # └─────────┘
623
def decode(encoding, strict: false)
  # Picks the native decoder for `encoding` ("hex" or "base64") and forwards
  # `strict` to it. Any other encoding raises ArgumentError.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # Single braces: the doubled form was a Python f-string escape that Ruby
    # printed literally.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
  end
end
632
+
633
+ # Encode a value using the provided encoding.
634
+ #
635
+ # @param encoding ["hex", "base64"]
636
+ # The encoding to use.
637
+ #
638
+ # @return [Expr]
639
+ #
640
+ # @example
641
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
642
+ # df.select(Polars.col("strings").str.encode("hex"))
643
+ # # =>
644
+ # # shape: (3, 1)
645
+ # # ┌─────────┐
646
+ # # │ strings │
647
+ # # │ --- │
648
+ # # │ str │
649
+ # # ╞═════════╡
650
+ # # │ 666f6f │
651
+ # # ├╌╌╌╌╌╌╌╌╌┤
652
+ # # │ 626172 │
653
+ # # ├╌╌╌╌╌╌╌╌╌┤
654
+ # # │ null │
655
+ # # └─────────┘
656
def encode(encoding)
  # Picks the native encoder for `encoding` ("hex" or "base64").
  # Any other encoding raises ArgumentError.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # Single braces: the doubled form was a Python f-string escape that Ruby
    # printed literally.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
  end
end
665
+
666
+ # Extract the target capture group from provided patterns.
667
+ #
668
+ # @param pattern [String]
669
+ # A valid regex pattern
670
+ # @param group_index [Integer]
671
+ # Index of the targeted capture group.
672
+ # Group 0 means the whole pattern; the first group begins at index 1.
673
+ # Defaults to the first capture group.
674
+ #
675
+ # @return [Expr]
676
+ #
677
+ # @example
678
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
679
+ # df.select(
680
+ # [
681
+ # Polars.col("foo").str.extract('(\d+)')
682
+ # ]
683
+ # )
684
+ # # =>
685
+ # # shape: (2, 1)
686
+ # # ┌─────┐
687
+ # # │ foo │
688
+ # # │ --- │
689
+ # # │ str │
690
+ # # ╞═════╡
691
+ # # │ 123 │
692
+ # # ├╌╌╌╌╌┤
693
+ # # │ 678 │
694
+ # # └─────┘
695
def extract(pattern, group_index: 1)
  # Capture-group extraction is evaluated by the native expression.
  captured = _rbexpr.str_extract(pattern, group_index)
  Utils.wrap_expr(captured)
end
698
+
699
+ # Extracts all matches for the given regex pattern.
700
+ #
701
+ # Extracts each successive non-overlapping regex match in an individual string as
702
+ # an array.
703
+ #
704
+ # @param pattern [String]
705
+ # A valid regex pattern
706
+ #
707
+ # @return [Expr]
708
+ #
709
+ # @example
710
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
711
+ # df.select(
712
+ # [
713
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
714
+ # ]
715
+ # )
716
+ # # =>
717
+ # # shape: (2, 1)
718
+ # # ┌────────────────┐
719
+ # # │ extracted_nrs │
720
+ # # │ --- │
721
+ # # │ list[str] │
722
+ # # ╞════════════════╡
723
+ # # │ ["123", "45"] │
724
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
725
+ # # │ ["678", "910"] │
726
+ # # └────────────────┘
727
def extract_all(pattern)
  # The pattern is first turned into an expression (strings become literals),
  # then its native handle is passed to the native matcher.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  matches = _rbexpr.str_extract_all(pattern_expr._rbexpr)
  Utils.wrap_expr(matches)
end
731
+
732
+ # Count all successive non-overlapping regex matches.
733
+ #
734
+ # @param pattern [String]
735
+ # A valid regex pattern
736
+ #
737
+ # @return [Expr]
738
+ #
739
+ # @example
740
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
741
+ # df.select(
742
+ # [
743
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
744
+ # ]
745
+ # )
746
+ # # =>
747
+ # # shape: (2, 1)
748
+ # # ┌──────────────┐
749
+ # # │ count_digits │
750
+ # # │ --- │
751
+ # # │ u32 │
752
+ # # ╞══════════════╡
753
+ # # │ 5 │
754
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
755
+ # # │ 6 │
756
+ # # └──────────────┘
757
def count_match(pattern)
  # NOTE(review): unlike its siblings, this native call has no `str_` prefix;
  # confirm against the native extension that `count_match` is the bound name.
  tally = _rbexpr.count_match(pattern)
  Utils.wrap_expr(tally)
end
760
+
761
+ # Split the string by a substring.
762
+ #
763
+ # @param by [String]
764
+ # Substring to split by.
765
+ # @param inclusive [Boolean]
766
+ # If true, include the split character/string in the results.
767
+ #
768
+ # @return [Expr]
769
+ #
770
+ # @example
771
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
772
+ # df.select(Polars.col("s").str.split(" "))
773
+ # # =>
774
+ # # shape: (3, 1)
775
+ # # ┌───────────────────────┐
776
+ # # │ s │
777
+ # # │ --- │
778
+ # # │ list[str] │
779
+ # # ╞═══════════════════════╡
780
+ # # │ ["foo", "bar"] │
781
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
782
+ # # │ ["foo-bar"] │
783
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
784
+ # # │ ["foo", "bar", "baz"] │
785
+ # # └───────────────────────┘
786
def split(by, inclusive: false)
  # `inclusive` selects the native variant that keeps the separator.
  pieces = inclusive ? _rbexpr.str_split_inclusive(by) : _rbexpr.str_split(by)
  Utils.wrap_expr(pieces)
end
793
+
794
+ # Split the string by a substring using `n` splits.
795
+ #
796
+ # Results in a struct of `n+1` fields.
797
+ #
798
+ # If it cannot make `n` splits, the remaining field elements will be null.
799
+ #
800
+ # @param by [String]
801
+ # Substring to split by.
802
+ # @param n [Integer]
803
+ # Number of splits to make.
804
+ # @param inclusive [Boolean]
805
+ # If true, include the split character/string in the results.
806
+ #
807
+ # @return [Expr]
808
+ #
809
+ # @example
810
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
811
+ # df.select(
812
+ # [
813
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
814
+ # ]
815
+ # )
816
+ # # =>
817
+ # # shape: (4, 1)
818
+ # # ┌─────────────┐
819
+ # # │ fields │
820
+ # # │ --- │
821
+ # # │ struct[2] │
822
+ # # ╞═════════════╡
823
+ # # │ {"a","1"} │
824
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
825
+ # # │ {null,null} │
826
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
827
+ # # │ {"c",null} │
828
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
829
+ # # │ {"d","4"} │
830
+ # # └─────────────┘
831
def split_exact(by, n, inclusive: false)
  # `inclusive` selects the native variant that keeps the separator.
  fields =
    if inclusive
      _rbexpr.str_split_exact_inclusive(by, n)
    else
      _rbexpr.str_split_exact(by, n)
    end
  Utils.wrap_expr(fields)
end
838
+
839
+ # Split the string by a substring, restricted to returning at most `n` items.
840
+ #
841
+ # If the number of possible splits is less than `n-1`, the remaining field
842
+ # elements will be null. If the number of possible splits is `n-1` or greater,
843
+ # the last (nth) substring will contain the remainder of the string.
844
+ #
845
+ # @param by [String]
846
+ # Substring to split by.
847
+ # @param n [Integer]
848
+ # Max number of items to return.
849
+ #
850
+ # @return [Expr]
851
+ #
852
+ # @example
853
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
854
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
855
+ # # =>
856
+ # # shape: (4, 1)
857
+ # # ┌───────────────────┐
858
+ # # │ fields │
859
+ # # │ --- │
860
+ # # │ struct[2] │
861
+ # # ╞═══════════════════╡
862
+ # # │ {"foo","bar"} │
863
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
864
+ # # │ {null,null} │
865
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
866
+ # # │ {"foo-bar",null} │
867
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
868
+ # # │ {"foo","bar baz"} │
869
+ # # └───────────────────┘
870
def splitn(by, n)
  # Splitting into at most `n` items is done by the native expression.
  fields = _rbexpr.str_splitn(by, n)
  Utils.wrap_expr(fields)
end
873
+
874
+ # Replace first matching regex/literal substring with a new string value.
875
+ #
876
+ # @param pattern [String]
877
+ # Regex pattern.
878
+ # @param value [String]
879
+ # Replacement string.
880
+ # @param literal [Boolean]
881
+ # Treat pattern as a literal string.
882
+ #
883
+ # @return [Expr]
884
+ #
885
+ # @example
886
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
887
+ # df.with_column(
888
+ # Polars.col("text").str.replace('abc\b', "ABC")
889
+ # )
890
+ # # =>
891
+ # # shape: (2, 2)
892
+ # # ┌─────┬────────┐
893
+ # # │ id ┆ text │
894
+ # # │ --- ┆ --- │
895
+ # # │ i64 ┆ str │
896
+ # # ╞═════╪════════╡
897
+ # # │ 1 ┆ 123ABC │
898
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
899
+ # # │ 2 ┆ abc456 │
900
+ # # └─────┴────────┘
901
def replace(pattern, value, literal: false)
  # Both arguments are turned into expressions (strings become literals) and
  # their native handles are passed to the native replace.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
906
+
907
+ # Replace all matching regex/literal substrings with a new string value.
908
+ #
909
+ # @param pattern [String]
910
+ # Regex pattern.
911
+ # @param value [String]
912
+ # Replacement string.
913
+ # @param literal [Boolean]
914
+ # Treat pattern as a literal string.
915
+ #
916
+ # @return [Expr]
917
+ #
918
+ # @example
919
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
920
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
921
+ # # =>
922
+ # # shape: (2, 2)
923
+ # # ┌─────┬─────────┐
924
+ # # │ id ┆ text │
925
+ # # │ --- ┆ --- │
926
+ # # │ i64 ┆ str │
927
+ # # ╞═════╪═════════╡
928
+ # # │ 1 ┆ -bc-bc │
929
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
930
+ # # │ 2 ┆ 123-123 │
931
+ # # └─────┴─────────┘
932
def replace_all(pattern, value, literal: false)
  # Both arguments are turned into expressions (strings become literals) and
  # their native handles are passed to the native replace-all.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
937
+
938
+ # Create subslices of the string values of a Utf8 Series.
939
+ #
940
+ # @param offset [Integer]
941
+ # Start index. Negative indexing is supported.
942
+ # @param length [Integer]
943
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
944
+ # end of the string.
945
+ #
946
+ # @return [Expr]
947
+ #
948
+ # @example
949
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
950
+ # df.with_column(
951
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
952
+ # )
953
+ # # =>
954
+ # # shape: (4, 2)
955
+ # # ┌─────────────┬──────────┐
956
+ # # │ s ┆ s_sliced │
957
+ # # │ --- ┆ --- │
958
+ # # │ str ┆ str │
959
+ # # ╞═════════════╪══════════╡
960
+ # # │ pear ┆ ear │
961
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
962
+ # # │ null ┆ null │
963
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
964
+ # # │ papaya ┆ aya │
965
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
966
+ # # │ dragonfruit ┆ uit │
967
+ # # └─────────────┴──────────┘
968
def slice(offset, length = nil)
  # `length` is forwarded as given, including nil.
  sliced = _rbexpr.str_slice(offset, length)
  Utils.wrap_expr(sliced)
end
971
+ end
972
+ end