polars-df 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +9 -0
  4. data/Cargo.lock +74 -3
  5. data/Cargo.toml +3 -0
  6. data/README.md +1 -1
  7. data/ext/polars/Cargo.toml +18 -1
  8. data/ext/polars/src/conversion.rs +115 -2
  9. data/ext/polars/src/dataframe.rs +228 -11
  10. data/ext/polars/src/error.rs +4 -0
  11. data/ext/polars/src/lazy/dataframe.rs +5 -5
  12. data/ext/polars/src/lazy/dsl.rs +157 -2
  13. data/ext/polars/src/lib.rs +185 -10
  14. data/ext/polars/src/list_construction.rs +100 -0
  15. data/ext/polars/src/series.rs +217 -29
  16. data/ext/polars/src/set.rs +91 -0
  17. data/ext/polars/src/utils.rs +19 -0
  18. data/lib/polars/batched_csv_reader.rb +1 -0
  19. data/lib/polars/cat_expr.rb +39 -0
  20. data/lib/polars/cat_name_space.rb +54 -0
  21. data/lib/polars/data_frame.rb +2384 -140
  22. data/lib/polars/date_time_expr.rb +1282 -7
  23. data/lib/polars/date_time_name_space.rb +1484 -0
  24. data/lib/polars/exceptions.rb +20 -0
  25. data/lib/polars/expr.rb +4374 -53
  26. data/lib/polars/expr_dispatch.rb +22 -0
  27. data/lib/polars/functions.rb +219 -0
  28. data/lib/polars/group_by.rb +518 -0
  29. data/lib/polars/io.rb +421 -2
  30. data/lib/polars/lazy_frame.rb +1267 -69
  31. data/lib/polars/lazy_functions.rb +412 -24
  32. data/lib/polars/lazy_group_by.rb +80 -0
  33. data/lib/polars/list_expr.rb +507 -5
  34. data/lib/polars/list_name_space.rb +346 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2256 -242
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +847 -10
  39. data/lib/polars/string_name_space.rb +690 -0
  40. data/lib/polars/struct_expr.rb +73 -0
  41. data/lib/polars/struct_name_space.rb +64 -0
  42. data/lib/polars/utils.rb +71 -3
  43. data/lib/polars/version.rb +2 -1
  44. data/lib/polars/when.rb +1 -0
  45. data/lib/polars/when_then.rb +1 -0
  46. data/lib/polars.rb +12 -10
  47. metadata +15 -2
@@ -1,34 +1,252 @@
1
1
  module Polars
2
+ # Namespace for string related expressions.
2
3
  class StringExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
8
11
 
9
- # def strptime
10
- # end
12
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
13
+ #
14
+ # @param datatype [Symbol]
15
+ # `:date`, `:datetime`, or `:time`.
16
+ # @param fmt [String]
17
+ # Format to use, refer to the
18
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
19
+ # for specification. Example: `"%y-%m-%d"`.
20
+ # @param strict [Boolean]
21
+ # Raise an error if any conversion fails.
22
+ # @param exact [Boolean]
23
+ # - If true, require an exact format match.
24
+ # - If false, allow the format to match anywhere in the target string.
25
+ #
26
+ # @return [Expr]
27
+ #
28
+ # @note
29
+ # When parsing a Datetime the column precision will be inferred from
30
+ # the format string, if given, e.g. "%F %T%.3f" => Datetime("ms"). If
31
+ # no fractional second component is found then the default is "us".
32
+ #
33
+ # @example
34
+ # s = Polars::Series.new(
35
+ # "date",
36
+ # [
37
+ # "2021-04-22",
38
+ # "2022-01-04 00:00:00",
39
+ # "01/31/22",
40
+ # "Sun Jul 8 00:34:60 2001"
41
+ # ]
42
+ # )
43
+ # s.to_frame.with_column(
44
+ # Polars.col("date")
45
+ # .str.strptime(:date, "%F", strict: false)
46
+ # .fill_null(
47
+ # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
48
+ # )
49
+ # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
50
+ # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
51
+ # )
52
+ # # =>
53
+ # # shape: (4, 1)
54
+ # # ┌────────────┐
55
+ # # │ date │
56
+ # # │ --- │
57
+ # # │ date │
58
+ # # ╞════════════╡
59
+ # # │ 2021-04-22 │
60
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
61
+ # # │ 2022-01-04 │
62
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
63
+ # # │ 2022-01-31 │
64
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
65
+ # # │ 2001-07-08 │
66
+ # # └────────────┘
67
+ def strptime(datatype, fmt = nil, strict: true, exact: true)
68
+ if !Utils.is_polars_dtype(datatype)
69
+ raise ArgumentError, "expected: {DataType} got: #{datatype}"
70
+ end
71
+
72
+ if datatype == :date
73
+ Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact))
74
+ elsif datatype == :datetime
75
+ # TODO fix
76
+ tu = nil # datatype.tu
77
+ dtcol = Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact))
78
+ if tu.nil?
79
+ dtcol
80
+ else
81
+ dtcol.dt.cast_time_unit(tu)
82
+ end
83
+ elsif datatype == :time
84
+ Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact))
85
+ else
86
+ raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
87
+ end
88
+ end
11
89
 
90
+ # Get length of the strings as `:u32` (as number of bytes).
91
+ #
92
+ # @return [Expr]
93
+ #
94
+ # @note
95
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
96
+ # need the length in terms of the number of characters, use `n_chars` instead.
97
+ #
98
+ # @example
99
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
100
+ # [
101
+ # Polars.col("s").str.lengths.alias("length"),
102
+ # Polars.col("s").str.n_chars.alias("nchars")
103
+ # ]
104
+ # )
105
+ # df
106
+ # # =>
107
+ # # shape: (4, 3)
108
+ # # ┌──────┬────────┬────────┐
109
+ # # │ s ┆ length ┆ nchars │
110
+ # # │ --- ┆ --- ┆ --- │
111
+ # # │ str ┆ u32 ┆ u32 │
112
+ # # ╞══════╪════════╪════════╡
113
+ # # │ Café ┆ 5 ┆ 4 │
114
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
115
+ # # │ null ┆ null ┆ null │
116
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
117
+ # # │ 345 ┆ 3 ┆ 3 │
118
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
119
+ # # │ 東京 ┆ 6 ┆ 2 │
120
+ # # └──────┴────────┴────────┘
12
121
  def lengths
13
122
  Utils.wrap_expr(_rbexpr.str_lengths)
14
123
  end
15
124
 
125
+ # Get length of the strings as `:u32` (as number of chars).
126
+ #
127
+ # @return [Expr]
128
+ #
129
+ # @note
130
+ # If you know that you are working with ASCII text, `lengths` will be
131
+ # equivalent, and faster (returns length in terms of the number of bytes).
132
+ #
133
+ # @example
134
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
135
+ # [
136
+ # Polars.col("s").str.lengths.alias("length"),
137
+ # Polars.col("s").str.n_chars.alias("nchars")
138
+ # ]
139
+ # )
140
+ # df
141
+ # # =>
142
+ # # shape: (4, 3)
143
+ # # ┌──────┬────────┬────────┐
144
+ # # │ s ┆ length ┆ nchars │
145
+ # # │ --- ┆ --- ┆ --- │
146
+ # # │ str ┆ u32 ┆ u32 │
147
+ # # ╞══════╪════════╪════════╡
148
+ # # │ Café ┆ 5 ┆ 4 │
149
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
150
+ # # │ null ┆ null ┆ null │
151
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
152
+ # # │ 345 ┆ 3 ┆ 3 │
153
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
154
+ # # │ 東京 ┆ 6 ┆ 2 │
155
+ # # └──────┴────────┴────────┘
16
156
  def n_chars
17
157
  Utils.wrap_expr(_rbexpr.str_n_chars)
18
158
  end
19
159
 
160
+ # Vertically concatenate the values in the Series to a single string value.
161
+ #
162
+ # @param delimiter [String]
163
+ # The delimiter to insert between consecutive string values.
164
+ #
165
+ # @return [Expr]
166
+ #
167
+ # @example
168
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
169
+ # df.select(Polars.col("foo").str.concat("-"))
170
+ # # =>
171
+ # # shape: (1, 1)
172
+ # # ┌──────────┐
173
+ # # │ foo │
174
+ # # │ --- │
175
+ # # │ str │
176
+ # # ╞══════════╡
177
+ # # │ 1-null-2 │
178
+ # # └──────────┘
20
179
  def concat(delimiter = "-")
21
180
  Utils.wrap_expr(_rbexpr.str_concat(delimiter))
22
181
  end
23
182
 
183
+ # Transform to uppercase variant.
184
+ #
185
+ # @return [Expr]
186
+ #
187
+ # @example
188
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
189
+ # df.select(Polars.col("foo").str.to_uppercase)
190
+ # # =>
191
+ # # shape: (2, 1)
192
+ # # ┌─────┐
193
+ # # │ foo │
194
+ # # │ --- │
195
+ # # │ str │
196
+ # # ╞═════╡
197
+ # # │ CAT │
198
+ # # ├╌╌╌╌╌┤
199
+ # # │ DOG │
200
+ # # └─────┘
24
201
  def to_uppercase
25
202
  Utils.wrap_expr(_rbexpr.str_to_uppercase)
26
203
  end
27
204
 
205
+ # Transform to lowercase variant.
206
+ #
207
+ # @return [Expr]
208
+ #
209
+ # @example
210
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
211
+ # df.select(Polars.col("foo").str.to_lowercase)
212
+ # # =>
213
+ # # shape: (2, 1)
214
+ # # ┌─────┐
215
+ # # │ foo │
216
+ # # │ --- │
217
+ # # │ str │
218
+ # # ╞═════╡
219
+ # # │ cat │
220
+ # # ├╌╌╌╌╌┤
221
+ # # │ dog │
222
+ # # └─────┘
28
223
  def to_lowercase
29
224
  Utils.wrap_expr(_rbexpr.str_to_lowercase)
30
225
  end
31
226
 
227
+ # Remove leading and trailing whitespace.
228
+ #
229
+ # @param matches [String, nil]
230
+ # An optional single character that should be trimmed.
231
+ #
232
+ # @return [Expr]
233
+ #
234
+ # @example
235
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
236
+ # df.select(Polars.col("foo").str.strip)
237
+ # # =>
238
+ # # shape: (3, 1)
239
+ # # ┌───────┐
240
+ # # │ foo │
241
+ # # │ --- │
242
+ # # │ str │
243
+ # # ╞═══════╡
244
+ # # │ lead │
245
+ # # ├╌╌╌╌╌╌╌┤
246
+ # # │ trail │
247
+ # # ├╌╌╌╌╌╌╌┤
248
+ # # │ both │
249
+ # # └───────┘
32
250
  def strip(matches = nil)
33
251
  if !matches.nil? && matches.length > 1
34
252
  raise ArgumentError, "matches should contain a single character"
@@ -36,6 +254,29 @@ module Polars
36
254
  Utils.wrap_expr(_rbexpr.str_strip(matches))
37
255
  end
38
256
 
257
+ # Remove leading whitespace.
258
+ #
259
+ # @param matches [String, nil]
260
+ # An optional single character that should be trimmed.
261
+ #
262
+ # @return [Expr]
263
+ #
264
+ # @example
265
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
266
+ # df.select(Polars.col("foo").str.lstrip)
267
+ # # =>
268
+ # # shape: (3, 1)
269
+ # # ┌────────┐
270
+ # # │ foo │
271
+ # # │ --- │
272
+ # # │ str │
273
+ # # ╞════════╡
274
+ # # │ lead │
275
+ # # ├╌╌╌╌╌╌╌╌┤
276
+ # # │ trail │
277
+ # # ├╌╌╌╌╌╌╌╌┤
278
+ # # │ both │
279
+ # # └────────┘
39
280
  def lstrip(matches = nil)
40
281
  if !matches.nil? && matches.length > 1
41
282
  raise ArgumentError, "matches should contain a single character"
@@ -43,6 +284,29 @@ module Polars
43
284
  Utils.wrap_expr(_rbexpr.str_lstrip(matches))
44
285
  end
45
286
 
287
+ # Remove trailing whitespace.
288
+ #
289
+ # @param matches [String, nil]
290
+ # An optional single character that should be trimmed.
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @example
295
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
296
+ # df.select(Polars.col("foo").str.rstrip)
297
+ # # =>
298
+ # # shape: (3, 1)
299
+ # # ┌───────┐
300
+ # # │ foo │
301
+ # # │ --- │
302
+ # # │ str │
303
+ # # ╞═══════╡
304
+ # # │ lead │
305
+ # # ├╌╌╌╌╌╌╌┤
306
+ # # │ trail │
307
+ # # ├╌╌╌╌╌╌╌┤
308
+ # # │ both │
309
+ # # └───────┘
46
310
  def rstrip(matches = nil)
47
311
  if !matches.nil? && matches.length > 1
48
312
  raise ArgumentError, "matches should contain a single character"
@@ -50,51 +314,474 @@ module Polars
50
314
  Utils.wrap_expr(_rbexpr.str_rstrip(matches))
51
315
  end
52
316
 
317
+ # Fills the string with zeroes.
318
+ #
319
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
320
+ # of length width.
321
+ #
322
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
323
+ # sign character rather than before. The original string is returned if width is
324
+ # less than or equal to `s.length`.
325
+ #
326
+ # @param alignment [Integer]
327
+ # Fill the value up to this length
328
+ #
329
+ # @return [Expr]
330
+ #
331
+ # @example
332
+ # df = Polars::DataFrame.new(
333
+ # {
334
+ # "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
335
+ # }
336
+ # )
337
+ # df.with_column(Polars.col("num").cast(String).str.zfill(5))
338
+ # # =>
339
+ # # shape: (11, 1)
340
+ # # ┌─────────┐
341
+ # # │ num │
342
+ # # │ --- │
343
+ # # │ str │
344
+ # # ╞═════════╡
345
+ # # │ -0010 │
346
+ # # ├╌╌╌╌╌╌╌╌╌┤
347
+ # # │ -0001 │
348
+ # # ├╌╌╌╌╌╌╌╌╌┤
349
+ # # │ 00000 │
350
+ # # ├╌╌╌╌╌╌╌╌╌┤
351
+ # # │ 00001 │
352
+ # # ├╌╌╌╌╌╌╌╌╌┤
353
+ # # │ ... │
354
+ # # ├╌╌╌╌╌╌╌╌╌┤
355
+ # # │ 10000 │
356
+ # # ├╌╌╌╌╌╌╌╌╌┤
357
+ # # │ 100000 │
358
+ # # ├╌╌╌╌╌╌╌╌╌┤
359
+ # # │ 1000000 │
360
+ # # ├╌╌╌╌╌╌╌╌╌┤
361
+ # # │ null │
362
+ # # └─────────┘
53
363
  def zfill(alignment)
54
364
  Utils.wrap_expr(_rbexpr.str_zfill(alignment))
55
365
  end
56
366
 
367
+ # Return the string left justified in a string of length `width`.
368
+ #
369
+ # Padding is done using the specified `fillchar`.
370
+ # The original string is returned if `width` is less than or equal to
371
+ # `s.length`.
372
+ #
373
+ # @param width [Integer]
374
+ # Justify left to this length.
375
+ # @param fillchar [String]
376
+ # Fill with this ASCII character.
377
+ #
378
+ # @return [Expr]
379
+ #
380
+ # @example
381
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
382
+ # df.select(Polars.col("a").str.ljust(8, "*"))
383
+ # # =>
384
+ # # shape: (4, 1)
385
+ # # ┌──────────────┐
386
+ # # │ a │
387
+ # # │ --- │
388
+ # # │ str │
389
+ # # ╞══════════════╡
390
+ # # │ cow***** │
391
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
392
+ # # │ monkey** │
393
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
394
+ # # │ null │
395
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
396
+ # # │ hippopotamus │
397
+ # # └──────────────┘
57
398
  def ljust(width, fillchar = " ")
58
399
  Utils.wrap_expr(_rbexpr.str_ljust(width, fillchar))
59
400
  end
60
401
 
402
+ # Return the string right justified in a string of length `width`.
403
+ #
404
+ # Padding is done using the specified `fillchar`.
405
+ # The original string is returned if `width` is less than or equal to
406
+ # `s.length`.
407
+ #
408
+ # @param width [Integer]
409
+ # Justify right to this length.
410
+ # @param fillchar [String]
411
+ # Fill with this ASCII character.
412
+ #
413
+ # @return [Expr]
414
+ #
415
+ # @example
416
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
417
+ # df.select(Polars.col("a").str.rjust(8, "*"))
418
+ # # =>
419
+ # # shape: (4, 1)
420
+ # # ┌──────────────┐
421
+ # # │ a │
422
+ # # │ --- │
423
+ # # │ str │
424
+ # # ╞══════════════╡
425
+ # # │ *****cow │
426
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
427
+ # # │ **monkey │
428
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
429
+ # # │ null │
430
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
431
+ # # │ hippopotamus │
432
+ # # └──────────────┘
61
433
  def rjust(width, fillchar = " ")
62
434
  Utils.wrap_expr(_rbexpr.str_rjust(width, fillchar))
63
435
  end
64
436
 
437
+ # Check if string contains a substring that matches a regex.
438
+ #
439
+ # @param pattern [String]
440
+ # A valid regex pattern.
441
+ # @param literal [Boolean]
442
+ # Treat pattern as a literal string.
443
+ #
444
+ # @return [Expr]
445
+ #
446
+ # @example
447
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
448
+ # df.select(
449
+ # [
450
+ # Polars.col("a"),
451
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
452
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
453
+ # ]
454
+ # )
455
+ # # =>
456
+ # # shape: (4, 3)
457
+ # # ┌─────────────┬───────┬─────────┐
458
+ # # │ a ┆ regex ┆ literal │
459
+ # # │ --- ┆ --- ┆ --- │
460
+ # # │ str ┆ bool ┆ bool │
461
+ # # ╞═════════════╪═══════╪═════════╡
462
+ # # │ Crab ┆ false ┆ false │
463
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
464
+ # # │ cat and dog ┆ true ┆ false │
465
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
466
+ # # │ rab$bit ┆ true ┆ true │
467
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
468
+ # # │ null ┆ null ┆ null │
469
+ # # └─────────────┴───────┴─────────┘
65
470
  def contains(pattern, literal: false)
66
471
  Utils.wrap_expr(_rbexpr.str_contains(pattern, literal))
67
472
  end
68
473
 
474
+ # Check if string values end with a substring.
475
+ #
476
+ # @param sub [String]
477
+ # Suffix substring.
478
+ #
479
+ # @return [Expr]
480
+ #
481
+ # @example
482
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
483
+ # df.with_column(
484
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
485
+ # )
486
+ # # =>
487
+ # # shape: (3, 2)
488
+ # # ┌────────┬────────────┐
489
+ # # │ fruits ┆ has_suffix │
490
+ # # │ --- ┆ --- │
491
+ # # │ str ┆ bool │
492
+ # # ╞════════╪════════════╡
493
+ # # │ apple ┆ false │
494
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
495
+ # # │ mango ┆ true │
496
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
497
+ # # │ null ┆ null │
498
+ # # └────────┴────────────┘
499
+ #
500
+ # @example Using `ends_with` as a filter condition:
501
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
502
+ # # =>
503
+ # # shape: (1, 1)
504
+ # # ┌────────┐
505
+ # # │ fruits │
506
+ # # │ --- │
507
+ # # │ str │
508
+ # # ╞════════╡
509
+ # # │ mango │
510
+ # # └────────┘
69
511
  def ends_with(sub)
70
512
  Utils.wrap_expr(_rbexpr.str_ends_with(sub))
71
513
  end
72
514
 
515
+ # Check if string values start with a substring.
516
+ #
517
+ # @param sub [String]
518
+ # Prefix substring.
519
+ #
520
+ # @return [Expr]
521
+ #
522
+ # @example
523
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
524
+ # df.with_column(
525
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
526
+ # )
527
+ # # =>
528
+ # # shape: (3, 2)
529
+ # # ┌────────┬────────────┐
530
+ # # │ fruits ┆ has_prefix │
531
+ # # │ --- ┆ --- │
532
+ # # │ str ┆ bool │
533
+ # # ╞════════╪════════════╡
534
+ # # │ apple ┆ true │
535
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
536
+ # # │ mango ┆ false │
537
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
538
+ # # │ null ┆ null │
539
+ # # └────────┴────────────┘
540
+ #
541
+ # @example Using `starts_with` as a filter condition:
542
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
543
+ # # =>
544
+ # # shape: (1, 1)
545
+ # # ┌────────┐
546
+ # # │ fruits │
547
+ # # │ --- │
548
+ # # │ str │
549
+ # # ╞════════╡
550
+ # # │ apple │
551
+ # # └────────┘
73
552
  def starts_with(sub)
74
553
  Utils.wrap_expr(_rbexpr.str_starts_with(sub))
75
554
  end
76
555
 
77
- # def json_path_match
78
- # end
556
+ # Extract the first match of json string with provided JSONPath expression.
557
+ #
558
+ # Raises an error if an invalid JSON string is encountered.
559
+ # All returned values are cast to Utf8 regardless of the original value.
560
+ #
561
+ # Documentation on JSONPath standard can be found
562
+ # [here](https://goessner.net/articles/JsonPath/).
563
+ #
564
+ # @param json_path [String]
565
+ # A valid JSON path query string.
566
+ #
567
+ # @return [Expr]
568
+ #
569
+ # @example
570
+ # df = Polars::DataFrame.new(
571
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
572
+ # )
573
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
574
+ # # =>
575
+ # # shape: (5, 1)
576
+ # # ┌──────────┐
577
+ # # │ json_val │
578
+ # # │ --- │
579
+ # # │ str │
580
+ # # ╞══════════╡
581
+ # # │ 1 │
582
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
583
+ # # │ null │
584
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
585
+ # # │ 2 │
586
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
587
+ # # │ 2.1 │
588
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
589
+ # # │ true │
590
+ # # └──────────┘
591
+ def json_path_match(json_path)
592
+ Utils.wrap_expr(_rbexpr.str_json_path_match(json_path))
593
+ end
79
594
 
80
- # def decode
81
- # end
595
+ # Decode a value using the provided encoding.
596
+ #
597
+ # @param encoding ["hex", "base64"]
598
+ # The encoding to use.
599
+ # @param strict [Boolean]
600
+ # How to handle invalid inputs:
601
+ #
602
+ # - `true`: An error will be thrown if unable to decode a value.
603
+ # - `false`: Unhandled values will be replaced with `nil`.
604
+ #
605
+ # @return [Expr]
606
+ #
607
+ # @example
608
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
609
+ # df.select(Polars.col("encoded").str.decode("hex"))
610
+ # # =>
611
+ # # shape: (3, 1)
612
+ # # ┌─────────┐
613
+ # # │ encoded │
614
+ # # │ --- │
615
+ # # │ str │
616
+ # # ╞═════════╡
617
+ # # │ foo │
618
+ # # ├╌╌╌╌╌╌╌╌╌┤
619
+ # # │ bar │
620
+ # # ├╌╌╌╌╌╌╌╌╌┤
621
+ # # │ null │
622
+ # # └─────────┘
623
+ def decode(encoding, strict: false)
624
+ if encoding == "hex"
625
+ Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
626
+ elsif encoding == "base64"
627
+ Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
628
+ else
629
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
630
+ end
631
+ end
82
632
 
83
- # def encode
84
- # end
633
+ # Encode a value using the provided encoding.
634
+ #
635
+ # @param encoding ["hex", "base64"]
636
+ # The encoding to use.
637
+ #
638
+ # @return [Expr]
639
+ #
640
+ # @example
641
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
642
+ # df.select(Polars.col("strings").str.encode("hex"))
643
+ # # =>
644
+ # # shape: (3, 1)
645
+ # # ┌─────────┐
646
+ # # │ strings │
647
+ # # │ --- │
648
+ # # │ str │
649
+ # # ╞═════════╡
650
+ # # │ 666f6f │
651
+ # # ├╌╌╌╌╌╌╌╌╌┤
652
+ # # │ 626172 │
653
+ # # ├╌╌╌╌╌╌╌╌╌┤
654
+ # # │ null │
655
+ # # └─────────┘
656
+ def encode(encoding)
657
+ if encoding == "hex"
658
+ Utils.wrap_expr(_rbexpr.str_hex_encode)
659
+ elsif encoding == "base64"
660
+ Utils.wrap_expr(_rbexpr.str_base64_encode)
661
+ else
662
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
663
+ end
664
+ end
85
665
 
666
+ # Extract the target capture group from provided patterns.
667
+ #
668
+ # @param pattern [String]
669
+ # A valid regex pattern
670
+ # @param group_index [Integer]
671
+ # Index of the targeted capture group.
672
+ # Group 0 means the whole pattern; the first group begins at index 1.
673
+ # Defaults to the first capture group.
674
+ #
675
+ # @return [Expr]
676
+ #
677
+ # @example
678
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
679
+ # df.select(
680
+ # [
681
+ # Polars.col("foo").str.extract('(\d+)')
682
+ # ]
683
+ # )
684
+ # # =>
685
+ # # shape: (2, 1)
686
+ # # ┌─────┐
687
+ # # │ foo │
688
+ # # │ --- │
689
+ # # │ str │
690
+ # # ╞═════╡
691
+ # # │ 123 │
692
+ # # ├╌╌╌╌╌┤
693
+ # # │ 678 │
694
+ # # └─────┘
86
695
  def extract(pattern, group_index: 1)
87
696
  Utils.wrap_expr(_rbexpr.str_extract(pattern, group_index))
88
697
  end
89
698
 
699
+ # Extracts all matches for the given regex pattern.
700
+ #
701
+ # Extracts each successive non-overlapping regex match in an individual string as
702
+ # an array.
703
+ #
704
+ # @param pattern [String]
705
+ # A valid regex pattern
706
+ #
707
+ # @return [Expr]
708
+ #
709
+ # @example
710
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
711
+ # df.select(
712
+ # [
713
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
714
+ # ]
715
+ # )
716
+ # # =>
717
+ # # shape: (2, 1)
718
+ # # ┌────────────────┐
719
+ # # │ extracted_nrs │
720
+ # # │ --- │
721
+ # # │ list[str] │
722
+ # # ╞════════════════╡
723
+ # # │ ["123", "45"] │
724
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
725
+ # # │ ["678", "910"] │
726
+ # # └────────────────┘
90
727
  def extract_all(pattern)
91
728
  Utils.wrap_expr(_rbexpr.str_extract_all(pattern))
92
729
  end
93
730
 
731
+ # Count all successive non-overlapping regex matches.
732
+ #
733
+ # @param pattern [String]
734
+ # A valid regex pattern
735
+ #
736
+ # @return [Expr]
737
+ #
738
+ # @example
739
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
740
+ # df.select(
741
+ # [
742
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
743
+ # ]
744
+ # )
745
+ # # =>
746
+ # # shape: (2, 1)
747
+ # # ┌──────────────┐
748
+ # # │ count_digits │
749
+ # # │ --- │
750
+ # # │ u32 │
751
+ # # ╞══════════════╡
752
+ # # │ 5 │
753
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
754
+ # # │ 6 │
755
+ # # └──────────────┘
94
756
  def count_match(pattern)
95
757
  Utils.wrap_expr(_rbexpr.count_match(pattern))
96
758
  end
97
759
 
760
+ # Split the string by a substring.
761
+ #
762
+ # @param by [String]
763
+ # Substring to split by.
764
+ # @param inclusive [Boolean]
765
+ # If true, include the split character/string in the results.
766
+ #
767
+ # @return [Expr]
768
+ #
769
+ # @example
770
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
771
+ # df.select(Polars.col("s").str.split(" "))
772
+ # # =>
773
+ # # shape: (3, 1)
774
+ # # ┌───────────────────────┐
775
+ # # │ s │
776
+ # # │ --- │
777
+ # # │ list[str] │
778
+ # # ╞═══════════════════════╡
779
+ # # │ ["foo", "bar"] │
780
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
781
+ # # │ ["foo-bar"] │
782
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
783
+ # # │ ["foo", "bar", "baz"] │
784
+ # # └───────────────────────┘
98
785
  def split(by, inclusive: false)
99
786
  if inclusive
100
787
  Utils.wrap_expr(_rbexpr.str_split_inclusive(by))
@@ -103,6 +790,43 @@ module Polars
103
790
  end
104
791
  end
105
792
 
793
+ # Split the string by a substring using `n` splits.
794
+ #
795
+ # Results in a struct of `n+1` fields.
796
+ #
797
+ # If it cannot make `n` splits, the remaining field elements will be null.
798
+ #
799
+ # @param by [String]
800
+ # Substring to split by.
801
+ # @param n [Integer]
802
+ # Number of splits to make.
803
+ # @param inclusive [Boolean]
804
+ # If true, include the split character/string in the results.
805
+ #
806
+ # @return [Expr]
807
+ #
808
+ # @example
809
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
810
+ # df.select(
811
+ # [
812
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
813
+ # ]
814
+ # )
815
+ # # =>
816
+ # # shape: (4, 1)
817
+ # # ┌─────────────┐
818
+ # # │ fields │
819
+ # # │ --- │
820
+ # # │ struct[2] │
821
+ # # ╞═════════════╡
822
+ # # │ {"a","1"} │
823
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
824
+ # # │ {null,null} │
825
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
826
+ # # │ {"c",null} │
827
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
828
+ # # │ {"d","4"} │
829
+ # # └─────────────┘
106
830
  def split_exact(by, n, inclusive: false)
107
831
  if inclusive
108
832
  Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(by, n))
@@ -111,22 +835,135 @@ module Polars
111
835
  end
112
836
  end
113
837
 
838
+ # Split the string by a substring, restricted to returning at most `n` items.
839
+ #
840
+ # If the number of possible splits is less than `n-1`, the remaining field
841
+ # elements will be null. If the number of possible splits is `n-1` or greater,
842
+ # the last (nth) substring will contain the remainder of the string.
843
+ #
844
+ # @param by [String]
845
+ # Substring to split by.
846
+ # @param n [Integer]
847
+ # Max number of items to return.
848
+ #
849
+ # @return [Expr]
850
+ #
851
+ # @example
852
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
853
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
854
+ # # =>
855
+ # # shape: (4, 1)
856
+ # # ┌───────────────────┐
857
+ # # │ fields │
858
+ # # │ --- │
859
+ # # │ struct[2] │
860
+ # # ╞═══════════════════╡
861
+ # # │ {"foo","bar"} │
862
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
863
+ # # │ {null,null} │
864
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
865
+ # # │ {"foo-bar",null} │
866
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
867
+ # # │ {"foo","bar baz"} │
868
+ # # └───────────────────┘
114
869
  def splitn(by, n)
115
870
  Utils.wrap_expr(_rbexpr.str_splitn(by, n))
116
871
  end
117
872
 
118
- def replace(pattern, literal: false)
873
+ # Replace first matching regex/literal substring with a new string value.
874
+ #
875
+ # @param pattern [String]
876
+ # Regex pattern.
877
+ # @param value [String]
878
+ # Replacement string.
879
+ # @param literal [Boolean]
880
+ # Treat pattern as a literal string.
881
+ #
882
+ # @return [Expr]
883
+ #
884
+ # @example
885
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
886
+ # df.with_column(
887
+ # Polars.col("text").str.replace('abc\b', "ABC")
888
+ # )
889
+ # # =>
890
+ # # shape: (2, 2)
891
+ # # ┌─────┬────────┐
892
+ # # │ id ┆ text │
893
+ # # │ --- ┆ --- │
894
+ # # │ i64 ┆ str │
895
+ # # ╞═════╪════════╡
896
+ # # │ 1 ┆ 123ABC │
897
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
898
+ # # │ 2 ┆ abc456 │
899
+ # # └─────┴────────┘
900
+ def replace(pattern, value, literal: false)
119
901
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
120
902
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
121
903
  Utils.wrap_expr(_rbexpr.str_replace(pattern._rbexpr, value._rbexpr, literal))
122
904
  end
123
905
 
124
- def replace_all(pattern, literal: false)
906
+ # Replace all matching regex/literal substrings with a new string value.
907
+ #
908
+ # @param pattern [String]
909
+ # Regex pattern.
910
+ # @param value [String]
911
+ # Replacement string.
912
+ # @param literal [Boolean]
913
+ # Treat pattern as a literal string.
914
+ #
915
+ # @return [Expr]
916
+ #
917
+ # @example
918
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
919
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
920
+ # # =>
921
+ # # shape: (2, 2)
922
+ # # ┌─────┬─────────┐
923
+ # # │ id ┆ text │
924
+ # # │ --- ┆ --- │
925
+ # # │ i64 ┆ str │
926
+ # # ╞═════╪═════════╡
927
+ # # │ 1 ┆ -bc-bc │
928
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
929
+ # # │ 2 ┆ 123-123 │
930
+ # # └─────┴─────────┘
931
+ def replace_all(pattern, value, literal: false)
125
932
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
126
933
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
127
934
  Utils.wrap_expr(_rbexpr.str_replace_all(pattern._rbexpr, value._rbexpr, literal))
128
935
  end
129
936
 
937
+ # Create subslices of the string values of a Utf8 Series.
938
+ #
939
+ # @param offset [Integer]
940
+ # Start index. Negative indexing is supported.
941
+ # @param length [Integer, nil]
942
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
943
+ # end of the string.
944
+ #
945
+ # @return [Expr]
946
+ #
947
+ # @example
948
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
949
+ # df.with_column(
950
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
951
+ # )
952
+ # # =>
953
+ # # shape: (4, 2)
954
+ # # ┌─────────────┬──────────┐
955
+ # # │ s ┆ s_sliced │
956
+ # # │ --- ┆ --- │
957
+ # # │ str ┆ str │
958
+ # # ╞═════════════╪══════════╡
959
+ # # │ pear ┆ ear │
960
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
961
+ # # │ null ┆ null │
962
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
963
+ # # │ papaya ┆ aya │
964
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
965
+ # # │ dragonfruit ┆ uit │
966
+ # # └─────────────┴──────────┘
130
967
  def slice(offset, length = nil)
131
968
  Utils.wrap_expr(_rbexpr.str_slice(offset, length))
132
969
  end