polars-df 0.2.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +33 -0
  4. data/Cargo.lock +2230 -0
  5. data/Cargo.toml +10 -0
  6. data/LICENSE-THIRD-PARTY.txt +38828 -0
  7. data/LICENSE.txt +20 -0
  8. data/README.md +91 -0
  9. data/lib/polars/3.0/polars.so +0 -0
  10. data/lib/polars/3.1/polars.so +0 -0
  11. data/lib/polars/3.2/polars.so +0 -0
  12. data/lib/polars/batched_csv_reader.rb +96 -0
  13. data/lib/polars/cat_expr.rb +52 -0
  14. data/lib/polars/cat_name_space.rb +54 -0
  15. data/lib/polars/convert.rb +100 -0
  16. data/lib/polars/data_frame.rb +4833 -0
  17. data/lib/polars/data_types.rb +122 -0
  18. data/lib/polars/date_time_expr.rb +1418 -0
  19. data/lib/polars/date_time_name_space.rb +1484 -0
  20. data/lib/polars/dynamic_group_by.rb +52 -0
  21. data/lib/polars/exceptions.rb +20 -0
  22. data/lib/polars/expr.rb +5307 -0
  23. data/lib/polars/expr_dispatch.rb +22 -0
  24. data/lib/polars/functions.rb +453 -0
  25. data/lib/polars/group_by.rb +558 -0
  26. data/lib/polars/io.rb +814 -0
  27. data/lib/polars/lazy_frame.rb +2442 -0
  28. data/lib/polars/lazy_functions.rb +1195 -0
  29. data/lib/polars/lazy_group_by.rb +93 -0
  30. data/lib/polars/list_expr.rb +610 -0
  31. data/lib/polars/list_name_space.rb +346 -0
  32. data/lib/polars/meta_expr.rb +54 -0
  33. data/lib/polars/rolling_group_by.rb +35 -0
  34. data/lib/polars/series.rb +3730 -0
  35. data/lib/polars/slice.rb +104 -0
  36. data/lib/polars/string_expr.rb +972 -0
  37. data/lib/polars/string_name_space.rb +690 -0
  38. data/lib/polars/struct_expr.rb +100 -0
  39. data/lib/polars/struct_name_space.rb +64 -0
  40. data/lib/polars/utils.rb +192 -0
  41. data/lib/polars/version.rb +4 -0
  42. data/lib/polars/when.rb +16 -0
  43. data/lib/polars/when_then.rb +19 -0
  44. data/lib/polars-df.rb +1 -0
  45. data/lib/polars.rb +50 -0
  46. metadata +89 -0
@@ -0,0 +1,972 @@
1
+ module Polars
2
+ # Namespace for string related expressions.
3
+ class StringExpr
4
+ # @private
5
+ attr_accessor :_rbexpr
6
+
7
+ # @private
8
def initialize(expr)
  # Share the `_rbexpr` held by the expression this namespace is built from.
  inner = expr._rbexpr
  self._rbexpr = inner
end
11
+
12
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
13
+ #
14
+ # @param datatype [Symbol]
15
+ # `:date`, `:datetime`, or `:time`.
16
+ # @param fmt [String]
17
+ # Format to use, refer to the
18
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
19
+ # for specification. Example: `"%y-%m-%d"`.
20
+ # @param strict [Boolean]
21
+ # Raise an error if any conversion fails.
22
+ # @param exact [Boolean]
23
+ # - If true, require an exact format match.
24
+ # - If false, allow the format to match anywhere in the target string.
25
+ #
26
+ # @return [Expr]
27
+ #
28
+ # @note
29
+ # When parsing a Datetime the column precision will be inferred from
30
+ # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
31
+ # no fractional second component is found then the default is "us".
32
+ #
33
+ # @example
34
+ # s = Polars::Series.new(
35
+ # "date",
36
+ # [
37
+ # "2021-04-22",
38
+ # "2022-01-04 00:00:00",
39
+ # "01/31/22",
40
+ # "Sun Jul 8 00:34:60 2001"
41
+ # ]
42
+ # )
43
+ # s.to_frame.with_column(
44
+ # Polars.col("date")
45
+ # .str.strptime(:date, "%F", strict: false)
46
+ # .fill_null(
47
+ # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
48
+ # )
49
+ # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
50
+ # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
51
+ # )
52
+ # # =>
53
+ # # shape: (4, 1)
54
+ # # ┌────────────┐
55
+ # # │ date │
56
+ # # │ --- │
57
+ # # │ date │
58
+ # # ╞════════════╡
59
+ # # │ 2021-04-22 │
60
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
61
+ # # │ 2022-01-04 │
62
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
63
+ # # │ 2022-01-31 │
64
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
65
+ # # │ 2001-07-08 │
66
+ # # └────────────┘
67
def strptime(datatype, fmt = nil, strict: true, exact: true, cache: true, tz_aware: false)
  # Reject anything that is not a known Polars data type before dispatching.
  # (The previous message printed a literal "{DataType}" placeholder.)
  unless Utils.is_polars_dtype(datatype)
    raise ArgumentError, "expected a Polars data type, got: #{datatype.inspect}"
  end

  # `cache` is forwarded to every parser; `tz_aware` is only forwarded when
  # parsing datetimes.
  if datatype == :date
    Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact, cache))
  elsif datatype == :datetime
    # TODO: cast the result to the requested time unit once the data type
    # carries one; until then the parser's own precision is returned as-is.
    Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware))
  elsif datatype == :time
    Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact, cache))
  else
    # A valid data type, but not one a string can be parsed into here.
    raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
  end
end
89
+
90
+ # Get length of the strings as `:u32` (as number of bytes).
91
+ #
92
+ # @return [Expr]
93
+ #
94
+ # @note
95
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
96
+ # need the length in terms of the number of characters, use `n_chars` instead.
97
+ #
98
+ # @example
99
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
100
+ # [
101
+ # Polars.col("s").str.lengths.alias("length"),
102
+ # Polars.col("s").str.n_chars.alias("nchars")
103
+ # ]
104
+ # )
105
+ # df
106
+ # # =>
107
+ # # shape: (4, 3)
108
+ # # ┌──────┬────────┬────────┐
109
+ # # │ s ┆ length ┆ nchars │
110
+ # # │ --- ┆ --- ┆ --- │
111
+ # # │ str ┆ u32 ┆ u32 │
112
+ # # ╞══════╪════════╪════════╡
113
+ # # │ Café ┆ 5 ┆ 4 │
114
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
115
+ # # │ null ┆ null ┆ null │
116
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
117
+ # # │ 345 ┆ 3 ┆ 3 │
118
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
119
+ # # │ 東京 ┆ 6 ┆ 2 │
120
+ # # └──────┴────────┴────────┘
121
def lengths
  # Ask the underlying expression for the lengths, then wrap the result.
  byte_lengths = _rbexpr.str_lengths
  Utils.wrap_expr(byte_lengths)
end
124
+
125
+ # Get length of the strings as `:u32` (as number of chars).
126
+ #
127
+ # @return [Expr]
128
+ #
129
+ # @note
130
+ # If you know that you are working with ASCII text, `lengths` will be
131
+ # equivalent, and faster (returns length in terms of the number of bytes).
132
+ #
133
+ # @example
134
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
135
+ # [
136
+ # Polars.col("s").str.lengths.alias("length"),
137
+ # Polars.col("s").str.n_chars.alias("nchars")
138
+ # ]
139
+ # )
140
+ # df
141
+ # # =>
142
+ # # shape: (4, 3)
143
+ # # ┌──────┬────────┬────────┐
144
+ # # │ s ┆ length ┆ nchars │
145
+ # # │ --- ┆ --- ┆ --- │
146
+ # # │ str ┆ u32 ┆ u32 │
147
+ # # ╞══════╪════════╪════════╡
148
+ # # │ Café ┆ 5 ┆ 4 │
149
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
150
+ # # │ null ┆ null ┆ null │
151
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
152
+ # # │ 345 ┆ 3 ┆ 3 │
153
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
154
+ # # │ 東京 ┆ 6 ┆ 2 │
155
+ # # └──────┴────────┴────────┘
156
def n_chars
  # Ask the underlying expression for the character counts, then wrap the result.
  char_counts = _rbexpr.str_n_chars
  Utils.wrap_expr(char_counts)
end
159
+
160
+ # Vertically concat the values in the Series to a single string value.
161
+ #
162
+ # @param delimiter [String]
163
+ # The delimiter to insert between consecutive string values.
164
+ #
165
+ # @return [Expr]
166
+ #
167
+ # @example
168
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
169
+ # df.select(Polars.col("foo").str.concat("-"))
170
+ # # =>
171
+ # # shape: (1, 1)
172
+ # # ┌──────────┐
173
+ # # │ foo │
174
+ # # │ --- │
175
+ # # │ str │
176
+ # # ╞══════════╡
177
+ # # │ 1-null-2 │
178
+ # # └──────────┘
179
def concat(delimiter = "-")
  # Forward the delimiter to the underlying expression and wrap the result.
  joined = _rbexpr.str_concat(delimiter)
  Utils.wrap_expr(joined)
end
182
+
183
+ # Transform to uppercase variant.
184
+ #
185
+ # @return [Expr]
186
+ #
187
+ # @example
188
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
189
+ # df.select(Polars.col("foo").str.to_uppercase)
190
+ # # =>
191
+ # # shape: (2, 1)
192
+ # # ┌─────┐
193
+ # # │ foo │
194
+ # # │ --- │
195
+ # # │ str │
196
+ # # ╞═════╡
197
+ # # │ CAT │
198
+ # # ├╌╌╌╌╌┤
199
+ # # │ DOG │
200
+ # # └─────┘
201
def to_uppercase
  # Delegate to the underlying expression and wrap the result.
  upper = _rbexpr.str_to_uppercase
  Utils.wrap_expr(upper)
end
204
+
205
+ # Transform to lowercase variant.
206
+ #
207
+ # @return [Expr]
208
+ #
209
+ # @example
210
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
211
+ # df.select(Polars.col("foo").str.to_lowercase)
212
+ # # =>
213
+ # # shape: (2, 1)
214
+ # # ┌─────┐
215
+ # # │ foo │
216
+ # # │ --- │
217
+ # # │ str │
218
+ # # ╞═════╡
219
+ # # │ cat │
220
+ # # ├╌╌╌╌╌┤
221
+ # # │ dog │
222
+ # # └─────┘
223
def to_lowercase
  # Delegate to the underlying expression and wrap the result.
  lower = _rbexpr.str_to_lowercase
  Utils.wrap_expr(lower)
end
226
+
227
+ # Remove leading and trailing whitespace.
228
+ #
229
+ # @param matches [String, nil]
230
+ # An optional single character that should be trimmed.
231
+ #
232
+ # @return [Expr]
233
+ #
234
+ # @example
235
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
236
+ # df.select(Polars.col("foo").str.strip)
237
+ # # =>
238
+ # # shape: (3, 1)
239
+ # # ┌───────┐
240
+ # # │ foo │
241
+ # # │ --- │
242
+ # # │ str │
243
+ # # ╞═══════╡
244
+ # # │ lead │
245
+ # # ├╌╌╌╌╌╌╌┤
246
+ # # │ trail │
247
+ # # ├╌╌╌╌╌╌╌┤
248
+ # # │ both │
249
+ # # └───────┘
250
def strip(matches = nil)
  # Only a single trim character is accepted; nil is forwarded as-is.
  unless matches.nil? || matches.length <= 1
    raise ArgumentError, "matches should contain a single character"
  end

  stripped = _rbexpr.str_strip(matches)
  Utils.wrap_expr(stripped)
end
256
+
257
+ # Remove leading whitespace.
258
+ #
259
+ # @param matches [String, nil]
260
+ # An optional single character that should be trimmed.
261
+ #
262
+ # @return [Expr]
263
+ #
264
+ # @example
265
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
266
+ # df.select(Polars.col("foo").str.lstrip)
267
+ # # =>
268
+ # # shape: (3, 1)
269
+ # # ┌────────┐
270
+ # # │ foo │
271
+ # # │ --- │
272
+ # # │ str │
273
+ # # ╞════════╡
274
+ # # │ lead │
275
+ # # ├╌╌╌╌╌╌╌╌┤
276
+ # # │ trail │
277
+ # # ├╌╌╌╌╌╌╌╌┤
278
+ # # │ both │
279
+ # # └────────┘
280
def lstrip(matches = nil)
  # Only a single trim character is accepted; nil is forwarded as-is.
  unless matches.nil? || matches.length <= 1
    raise ArgumentError, "matches should contain a single character"
  end

  stripped = _rbexpr.str_lstrip(matches)
  Utils.wrap_expr(stripped)
end
286
+
287
+ # Remove trailing whitespace.
288
+ #
289
+ # @param matches [String, nil]
290
+ # An optional single character that should be trimmed.
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @example
295
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
296
+ # df.select(Polars.col("foo").str.rstrip)
297
+ # # =>
298
+ # # shape: (3, 1)
299
+ # # ┌───────┐
300
+ # # │ foo │
301
+ # # │ --- │
302
+ # # │ str │
303
+ # # ╞═══════╡
304
+ # # │ lead │
305
+ # # ├╌╌╌╌╌╌╌┤
306
+ # # │ trail │
307
+ # # ├╌╌╌╌╌╌╌┤
308
+ # # │ both │
309
+ # # └───────┘
310
def rstrip(matches = nil)
  # Only a single trim character is accepted; nil is forwarded as-is.
  unless matches.nil? || matches.length <= 1
    raise ArgumentError, "matches should contain a single character"
  end

  stripped = _rbexpr.str_rstrip(matches)
  Utils.wrap_expr(stripped)
end
316
+
317
+ # Fills the string with zeroes.
318
+ #
319
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
320
+ # of length width.
321
+ #
322
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
323
+ # sign character rather than before. The original string is returned if width is
324
+ # less than or equal to `s.length`.
325
+ #
326
+ # @param alignment [Integer]
327
+ # Fill the value up to this length
328
+ #
329
+ # @return [Expr]
330
+ #
331
+ # @example
332
+ # df = Polars::DataFrame.new(
333
+ # {
334
+ # "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
335
+ # }
336
+ # )
337
+ # df.with_column(Polars.col("num").cast(String).str.zfill(5))
338
+ # # =>
339
+ # # shape: (11, 1)
340
+ # # ┌─────────┐
341
+ # # │ num │
342
+ # # │ --- │
343
+ # # │ str │
344
+ # # ╞═════════╡
345
+ # # │ -0010 │
346
+ # # ├╌╌╌╌╌╌╌╌╌┤
347
+ # # │ -0001 │
348
+ # # ├╌╌╌╌╌╌╌╌╌┤
349
+ # # │ 00000 │
350
+ # # ├╌╌╌╌╌╌╌╌╌┤
351
+ # # │ 00001 │
352
+ # # ├╌╌╌╌╌╌╌╌╌┤
353
+ # # │ ... │
354
+ # # ├╌╌╌╌╌╌╌╌╌┤
355
+ # # │ 10000 │
356
+ # # ├╌╌╌╌╌╌╌╌╌┤
357
+ # # │ 100000 │
358
+ # # ├╌╌╌╌╌╌╌╌╌┤
359
+ # # │ 1000000 │
360
+ # # ├╌╌╌╌╌╌╌╌╌┤
361
+ # # │ null │
362
+ # # └─────────┘
363
def zfill(alignment)
  # Forward the target length to the underlying expression and wrap the result.
  padded = _rbexpr.str_zfill(alignment)
  Utils.wrap_expr(padded)
end
366
+
367
+ # Return the string left justified in a string of length `width`.
368
+ #
369
+ # Padding is done using the specified `fillchar`.
370
+ # The original string is returned if `width` is less than or equal to
371
+ # `s.length`.
372
+ #
373
+ # @param width [Integer]
374
+ # Justify left to this length.
375
+ # @param fillchar [String]
376
+ # Fill with this ASCII character.
377
+ #
378
+ # @return [Expr]
379
+ #
380
+ # @example
381
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
382
+ # df.select(Polars.col("a").str.ljust(8, "*"))
383
+ # # =>
384
+ # # shape: (4, 1)
385
+ # # ┌──────────────┐
386
+ # # │ a │
387
+ # # │ --- │
388
+ # # │ str │
389
+ # # ╞══════════════╡
390
+ # # │ cow***** │
391
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
392
+ # # │ monkey** │
393
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
394
+ # # │ null │
395
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
396
+ # # │ hippopotamus │
397
+ # # └──────────────┘
398
def ljust(width, fillchar = " ")
  # Forward width and fill character to the underlying expression.
  justified = _rbexpr.str_ljust(width, fillchar)
  Utils.wrap_expr(justified)
end
401
+
402
+ # Return the string right justified in a string of length `width`.
403
+ #
404
+ # Padding is done using the specified `fillchar`.
405
+ # The original string is returned if `width` is less than or equal to
406
+ # `s.length`.
407
+ #
408
+ # @param width [Integer]
409
+ # Justify right to this length.
410
+ # @param fillchar [String]
411
+ # Fill with this ASCII character.
412
+ #
413
+ # @return [Expr]
414
+ #
415
+ # @example
416
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
417
+ # df.select(Polars.col("a").str.rjust(8, "*"))
418
+ # # =>
419
+ # # shape: (4, 1)
420
+ # # ┌──────────────┐
421
+ # # │ a │
422
+ # # │ --- │
423
+ # # │ str │
424
+ # # ╞══════════════╡
425
+ # # │ *****cow │
426
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
427
+ # # │ **monkey │
428
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
429
+ # # │ null │
430
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
431
+ # # │ hippopotamus │
432
+ # # └──────────────┘
433
def rjust(width, fillchar = " ")
  # Forward width and fill character to the underlying expression.
  justified = _rbexpr.str_rjust(width, fillchar)
  Utils.wrap_expr(justified)
end
436
+
437
+ # Check if string contains a substring that matches a regex.
438
+ #
439
+ # @param pattern [String]
440
+ # A valid regex pattern.
441
+ # @param literal [Boolean]
442
+ # Treat pattern as a literal string.
443
+ #
444
+ # @return [Expr]
445
+ #
446
+ # @example
447
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
448
+ # df.select(
449
+ # [
450
+ # Polars.col("a"),
451
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
452
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
453
+ # ]
454
+ # )
455
+ # # =>
456
+ # # shape: (4, 3)
457
+ # # ┌─────────────┬───────┬─────────┐
458
+ # # │ a ┆ regex ┆ literal │
459
+ # # │ --- ┆ --- ┆ --- │
460
+ # # │ str ┆ bool ┆ bool │
461
+ # # ╞═════════════╪═══════╪═════════╡
462
+ # # │ Crab ┆ false ┆ false │
463
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
464
+ # # │ cat and dog ┆ true ┆ false │
465
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
466
+ # # │ rab$bit ┆ true ┆ true │
467
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
468
+ # # │ null ┆ null ┆ null │
469
+ # # └─────────────┴───────┴─────────┘
470
def contains(pattern, literal: false)
  # Forward the pattern and the literal flag to the underlying expression.
  matched = _rbexpr.str_contains(pattern, literal)
  Utils.wrap_expr(matched)
end
473
+
474
+ # Check if string values end with a substring.
475
+ #
476
+ # @param sub [String]
477
+ # Suffix substring.
478
+ #
479
+ # @return [Expr]
480
+ #
481
+ # @example
482
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
483
+ # df.with_column(
484
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
485
+ # )
486
+ # # =>
487
+ # # shape: (3, 2)
488
+ # # ┌────────┬────────────┐
489
+ # # │ fruits ┆ has_suffix │
490
+ # # │ --- ┆ --- │
491
+ # # │ str ┆ bool │
492
+ # # ╞════════╪════════════╡
493
+ # # │ apple ┆ false │
494
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
495
+ # # │ mango ┆ true │
496
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
497
+ # # │ null ┆ null │
498
+ # # └────────┴────────────┘
499
+ #
500
+ # @example Using `ends_with` as a filter condition:
501
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
502
+ # # =>
503
+ # # shape: (1, 1)
504
+ # # ┌────────┐
505
+ # # │ fruits │
506
+ # # │ --- │
507
+ # # │ str │
508
+ # # ╞════════╡
509
+ # # │ mango │
510
+ # # └────────┘
511
def ends_with(sub)
  # Forward the suffix to the underlying expression and wrap the result.
  suffix_check = _rbexpr.str_ends_with(sub)
  Utils.wrap_expr(suffix_check)
end
514
+
515
+ # Check if string values start with a substring.
516
+ #
517
+ # @param sub [String]
518
+ # Prefix substring.
519
+ #
520
+ # @return [Expr]
521
+ #
522
+ # @example
523
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
524
+ # df.with_column(
525
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
526
+ # )
527
+ # # =>
528
+ # # shape: (3, 2)
529
+ # # ┌────────┬────────────┐
530
+ # # │ fruits ┆ has_prefix │
531
+ # # │ --- ┆ --- │
532
+ # # │ str ┆ bool │
533
+ # # ╞════════╪════════════╡
534
+ # # │ apple ┆ true │
535
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
536
+ # # │ mango ┆ false │
537
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
538
+ # # │ null ┆ null │
539
+ # # └────────┴────────────┘
540
+ #
541
+ # @example Using `starts_with` as a filter condition:
542
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
543
+ # # =>
544
+ # # shape: (1, 1)
545
+ # # ┌────────┐
546
+ # # │ fruits │
547
+ # # │ --- │
548
+ # # │ str │
549
+ # # ╞════════╡
550
+ # # │ apple │
551
+ # # └────────┘
552
def starts_with(sub)
  # Forward the prefix to the underlying expression and wrap the result.
  prefix_check = _rbexpr.str_starts_with(sub)
  Utils.wrap_expr(prefix_check)
end
555
+
556
+ # Extract the first match of json string with provided JSONPath expression.
557
+ #
558
+ # Raises an error if an invalid JSON string is encountered.
559
+ # All returned values are cast to Utf8 regardless of their original type.
560
+ #
561
+ # Documentation on JSONPath standard can be found
562
+ # [here](https://goessner.net/articles/JsonPath/).
563
+ #
564
+ # @param json_path [String]
565
+ # A valid JSON path query string.
566
+ #
567
+ # @return [Expr]
568
+ #
569
+ # @example
570
+ # df = Polars::DataFrame.new(
571
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
572
+ # )
573
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
574
+ # # =>
575
+ # # shape: (5, 1)
576
+ # # ┌──────────┐
577
+ # # │ json_val │
578
+ # # │ --- │
579
+ # # │ str │
580
+ # # ╞══════════╡
581
+ # # │ 1 │
582
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
583
+ # # │ null │
584
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
585
+ # # │ 2 │
586
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
587
+ # # │ 2.1 │
588
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
589
+ # # │ true │
590
+ # # └──────────┘
591
def json_path_match(json_path)
  # Forward the JSONPath query to the underlying expression and wrap the result.
  matched = _rbexpr.str_json_path_match(json_path)
  Utils.wrap_expr(matched)
end
594
+
595
+ # Decode a value using the provided encoding.
596
+ #
597
+ # @param encoding ["hex", "base64"]
598
+ # The encoding to use.
599
+ # @param strict [Boolean]
600
+ # How to handle invalid inputs:
601
+ #
602
+ # - `true`: An error will be thrown if unable to decode a value.
603
+ # - `false`: Unhandled values will be replaced with `nil`.
604
+ #
605
+ # @return [Expr]
606
+ #
607
+ # @example
608
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
609
+ # df.select(Polars.col("encoded").str.decode("hex"))
610
+ # # =>
611
+ # # shape: (3, 1)
612
+ # # ┌─────────┐
613
+ # # │ encoded │
614
+ # # │ --- │
615
+ # # │ str │
616
+ # # ╞═════════╡
617
+ # # │ foo │
618
+ # # ├╌╌╌╌╌╌╌╌╌┤
619
+ # # │ bar │
620
+ # # ├╌╌╌╌╌╌╌╌╌┤
621
+ # # │ null │
622
+ # # └─────────┘
623
def decode(encoding, strict: false)
  # Dispatch on the encoding name; `strict` is forwarded to the chosen decoder.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
  else
    # Single braces: the doubled "{{ }}" was a Python f-string escape that
    # Ruby would print verbatim.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
  end
end
632
+
633
+ # Encode a value using the provided encoding.
634
+ #
635
+ # @param encoding ["hex", "base64"]
636
+ # The encoding to use.
637
+ #
638
+ # @return [Expr]
639
+ #
640
+ # @example
641
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
642
+ # df.select(Polars.col("strings").str.encode("hex"))
643
+ # # =>
644
+ # # shape: (3, 1)
645
+ # # ┌─────────┐
646
+ # # │ strings │
647
+ # # │ --- │
648
+ # # │ str │
649
+ # # ╞═════════╡
650
+ # # │ 666f6f │
651
+ # # ├╌╌╌╌╌╌╌╌╌┤
652
+ # # │ 626172 │
653
+ # # ├╌╌╌╌╌╌╌╌╌┤
654
+ # # │ null │
655
+ # # └─────────┘
656
def encode(encoding)
  # Dispatch on the encoding name to the matching encoder.
  case encoding
  when "hex"
    Utils.wrap_expr(_rbexpr.str_hex_encode)
  when "base64"
    Utils.wrap_expr(_rbexpr.str_base64_encode)
  else
    # Single braces: the doubled "{{ }}" was a Python f-string escape that
    # Ruby would print verbatim.
    raise ArgumentError, "encoding must be one of {'hex', 'base64'}, got #{encoding}"
  end
end
665
+
666
+ # Extract the target capture group from provided patterns.
667
+ #
668
+ # @param pattern [String]
669
+ # A valid regex pattern
670
+ # @param group_index [Integer]
671
+ # Index of the targeted capture group.
672
+ # Group 0 means the whole pattern; the first capture group starts at index 1.
673
+ # Defaults to the first capture group.
674
+ #
675
+ # @return [Expr]
676
+ #
677
+ # @example
678
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
679
+ # df.select(
680
+ # [
681
+ # Polars.col("foo").str.extract('(\d+)')
682
+ # ]
683
+ # )
684
+ # # =>
685
+ # # shape: (2, 1)
686
+ # # ┌─────┐
687
+ # # │ foo │
688
+ # # │ --- │
689
+ # # │ str │
690
+ # # ╞═════╡
691
+ # # │ 123 │
692
+ # # ├╌╌╌╌╌┤
693
+ # # │ 678 │
694
+ # # └─────┘
695
def extract(pattern, group_index: 1)
  # Forward the pattern and capture-group index to the underlying expression.
  captured = _rbexpr.str_extract(pattern, group_index)
  Utils.wrap_expr(captured)
end
698
+
699
+ # Extracts all matches for the given regex pattern.
700
+ #
701
+ # Extracts each successive non-overlapping regex match in an individual string as
702
+ # an array.
703
+ #
704
+ # @param pattern [String]
705
+ # A valid regex pattern
706
+ #
707
+ # @return [Expr]
708
+ #
709
+ # @example
710
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
711
+ # df.select(
712
+ # [
713
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
714
+ # ]
715
+ # )
716
+ # # =>
717
+ # # shape: (2, 1)
718
+ # # ┌────────────────┐
719
+ # # │ extracted_nrs │
720
+ # # │ --- │
721
+ # # │ list[str] │
722
+ # # ╞════════════════╡
723
+ # # │ ["123", "45"] │
724
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
725
+ # # │ ["678", "910"] │
726
+ # # └────────────────┘
727
def extract_all(pattern)
  # Convert the pattern through Utils (strings become literals), then hand
  # its `_rbexpr` to the underlying expression.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  matches = _rbexpr.str_extract_all(pattern_expr._rbexpr)
  Utils.wrap_expr(matches)
end
731
+
732
+ # Count all successive non-overlapping regex matches.
733
+ #
734
+ # @param pattern [String]
735
+ # A valid regex pattern
736
+ #
737
+ # @return [Expr]
738
+ #
739
+ # @example
740
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
741
+ # df.select(
742
+ # [
743
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
744
+ # ]
745
+ # )
746
+ # # =>
747
+ # # shape: (2, 1)
748
+ # # ┌──────────────┐
749
+ # # │ count_digits │
750
+ # # │ --- │
751
+ # # │ u32 │
752
+ # # ╞══════════════╡
753
+ # # │ 5 │
754
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
755
+ # # │ 6 │
756
+ # # └──────────────┘
757
def count_match(pattern)
  # Forward the pattern to the underlying expression and wrap the result.
  match_count = _rbexpr.count_match(pattern)
  Utils.wrap_expr(match_count)
end
760
+
761
+ # Split the string by a substring.
762
+ #
763
+ # @param by [String]
764
+ # Substring to split by.
765
+ # @param inclusive [Boolean]
766
+ # If true, include the split character/string in the results.
767
+ #
768
+ # @return [Expr]
769
+ #
770
+ # @example
771
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
772
+ # df.select(Polars.col("s").str.split(" "))
773
+ # # =>
774
+ # # shape: (3, 1)
775
+ # # ┌───────────────────────┐
776
+ # # │ s │
777
+ # # │ --- │
778
+ # # │ list[str] │
779
+ # # ╞═══════════════════════╡
780
+ # # │ ["foo", "bar"] │
781
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
782
+ # # │ ["foo-bar"] │
783
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
784
+ # # │ ["foo", "bar", "baz"] │
785
+ # # └───────────────────────┘
786
def split(by, inclusive: false)
  # Choose the inclusive or exclusive splitter, then wrap the result.
  pieces = inclusive ? _rbexpr.str_split_inclusive(by) : _rbexpr.str_split(by)
  Utils.wrap_expr(pieces)
end
793
+
794
+ # Split the string by a substring using `n` splits.
795
+ #
796
+ # Results in a struct of `n+1` fields.
797
+ #
798
+ # If it cannot make `n` splits, the remaining field elements will be null.
799
+ #
800
+ # @param by [String]
801
+ # Substring to split by.
802
+ # @param n [Integer]
803
+ # Number of splits to make.
804
+ # @param inclusive [Boolean]
805
+ # If true, include the split character/string in the results.
806
+ #
807
+ # @return [Expr]
808
+ #
809
+ # @example
810
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
811
+ # df.select(
812
+ # [
813
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
814
+ # ]
815
+ # )
816
+ # # =>
817
+ # # shape: (4, 1)
818
+ # # ┌─────────────┐
819
+ # # │ fields │
820
+ # # │ --- │
821
+ # # │ struct[2] │
822
+ # # ╞═════════════╡
823
+ # # │ {"a","1"} │
824
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
825
+ # # │ {null,null} │
826
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
827
+ # # │ {"c",null} │
828
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
829
+ # # │ {"d","4"} │
830
+ # # └─────────────┘
831
def split_exact(by, n, inclusive: false)
  # Choose the inclusive or exclusive splitter, then wrap the result.
  fields = inclusive ? _rbexpr.str_split_exact_inclusive(by, n) : _rbexpr.str_split_exact(by, n)
  Utils.wrap_expr(fields)
end
838
+
839
+ # Split the string by a substring, restricted to returning at most `n` items.
840
+ #
841
+ # If the number of possible splits is less than `n-1`, the remaining field
842
+ # elements will be null. If the number of possible splits is `n-1` or greater,
843
+ # the last (nth) substring will contain the remainder of the string.
844
+ #
845
+ # @param by [String]
846
+ # Substring to split by.
847
+ # @param n [Integer]
848
+ # Max number of items to return.
849
+ #
850
+ # @return [Expr]
851
+ #
852
+ # @example
853
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
854
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
855
+ # # =>
856
+ # # shape: (4, 1)
857
+ # # ┌───────────────────┐
858
+ # # │ fields │
859
+ # # │ --- │
860
+ # # │ struct[2] │
861
+ # # ╞═══════════════════╡
862
+ # # │ {"foo","bar"} │
863
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
864
+ # # │ {null,null} │
865
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
866
+ # # │ {"foo-bar",null} │
867
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
868
+ # # │ {"foo","bar baz"} │
869
+ # # └───────────────────┘
870
def splitn(by, n)
  # Forward the separator and item limit to the underlying expression.
  fields = _rbexpr.str_splitn(by, n)
  Utils.wrap_expr(fields)
end
873
+
874
+ # Replace first matching regex/literal substring with a new string value.
875
+ #
876
+ # @param pattern [String]
877
+ # Regex pattern.
878
+ # @param value [String]
879
+ # Replacement string.
880
+ # @param literal [Boolean]
881
+ # Treat pattern as a literal string.
882
+ #
883
+ # @return [Expr]
884
+ #
885
+ # @example
886
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
887
+ # df.with_column(
888
+ # Polars.col("text").str.replace('abc\b', "ABC")
889
+ # )
890
+ # # =>
891
+ # # shape: (2, 2)
892
+ # # ┌─────┬────────┐
893
+ # # │ id ┆ text │
894
+ # # │ --- ┆ --- │
895
+ # # │ i64 ┆ str │
896
+ # # ╞═════╪════════╡
897
+ # # │ 1 ┆ 123ABC │
898
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
899
+ # # │ 2 ┆ abc456 │
900
+ # # └─────┴────────┘
901
def replace(pattern, value, literal: false)
  # Convert both arguments through Utils (strings become literals), pattern
  # first, then pass their `_rbexpr`s to the underlying expression.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
906
+
907
+ # Replace all matching regex/literal substrings with a new string value.
908
+ #
909
+ # @param pattern [String]
910
+ # Regex pattern.
911
+ # @param value [String]
912
+ # Replacement string.
913
+ # @param literal [Boolean]
914
+ # Treat pattern as a literal string.
915
+ #
916
+ # @return [Expr]
917
+ #
918
+ # @example
919
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
920
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
921
+ # # =>
922
+ # # shape: (2, 2)
923
+ # # ┌─────┬─────────┐
924
+ # # │ id ┆ text │
925
+ # # │ --- ┆ --- │
926
+ # # │ i64 ┆ str │
927
+ # # ╞═════╪═════════╡
928
+ # # │ 1 ┆ -bc-bc │
929
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
930
+ # # │ 2 ┆ 123-123 │
931
+ # # └─────┴─────────┘
932
def replace_all(pattern, value, literal: false)
  # Convert both arguments through Utils (strings become literals), pattern
  # first, then pass their `_rbexpr`s to the underlying expression.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
937
+
938
+ # Create subslices of the string values of a Utf8 Series.
939
+ #
940
+ # @param offset [Integer]
941
+ # Start index. Negative indexing is supported.
942
+ # @param length [Integer]
943
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
944
+ # end of the string.
945
+ #
946
+ # @return [Expr]
947
+ #
948
+ # @example
949
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
950
+ # df.with_column(
951
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
952
+ # )
953
+ # # =>
954
+ # # shape: (4, 2)
955
+ # # ┌─────────────┬──────────┐
956
+ # # │ s ┆ s_sliced │
957
+ # # │ --- ┆ --- │
958
+ # # │ str ┆ str │
959
+ # # ╞═════════════╪══════════╡
960
+ # # │ pear ┆ ear │
961
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
962
+ # # │ null ┆ null │
963
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
964
+ # # │ papaya ┆ aya │
965
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
966
+ # # │ dragonfruit ┆ uit │
967
+ # # └─────────────┴──────────┘
968
def slice(offset, length = nil)
  # Forward the offset and optional length to the underlying expression.
  sliced = _rbexpr.str_slice(offset, length)
  Utils.wrap_expr(sliced)
end
971
+ end
972
+ end