polars-df 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +3 -0
  3. data/CHANGELOG.md +9 -0
  4. data/Cargo.lock +74 -3
  5. data/Cargo.toml +3 -0
  6. data/README.md +1 -1
  7. data/ext/polars/Cargo.toml +18 -1
  8. data/ext/polars/src/conversion.rs +115 -2
  9. data/ext/polars/src/dataframe.rs +228 -11
  10. data/ext/polars/src/error.rs +4 -0
  11. data/ext/polars/src/lazy/dataframe.rs +5 -5
  12. data/ext/polars/src/lazy/dsl.rs +157 -2
  13. data/ext/polars/src/lib.rs +185 -10
  14. data/ext/polars/src/list_construction.rs +100 -0
  15. data/ext/polars/src/series.rs +217 -29
  16. data/ext/polars/src/set.rs +91 -0
  17. data/ext/polars/src/utils.rs +19 -0
  18. data/lib/polars/batched_csv_reader.rb +1 -0
  19. data/lib/polars/cat_expr.rb +39 -0
  20. data/lib/polars/cat_name_space.rb +54 -0
  21. data/lib/polars/data_frame.rb +2384 -140
  22. data/lib/polars/date_time_expr.rb +1282 -7
  23. data/lib/polars/date_time_name_space.rb +1484 -0
  24. data/lib/polars/exceptions.rb +20 -0
  25. data/lib/polars/expr.rb +4374 -53
  26. data/lib/polars/expr_dispatch.rb +22 -0
  27. data/lib/polars/functions.rb +219 -0
  28. data/lib/polars/group_by.rb +518 -0
  29. data/lib/polars/io.rb +421 -2
  30. data/lib/polars/lazy_frame.rb +1267 -69
  31. data/lib/polars/lazy_functions.rb +412 -24
  32. data/lib/polars/lazy_group_by.rb +80 -0
  33. data/lib/polars/list_expr.rb +507 -5
  34. data/lib/polars/list_name_space.rb +346 -0
  35. data/lib/polars/meta_expr.rb +21 -0
  36. data/lib/polars/series.rb +2256 -242
  37. data/lib/polars/slice.rb +104 -0
  38. data/lib/polars/string_expr.rb +847 -10
  39. data/lib/polars/string_name_space.rb +690 -0
  40. data/lib/polars/struct_expr.rb +73 -0
  41. data/lib/polars/struct_name_space.rb +64 -0
  42. data/lib/polars/utils.rb +71 -3
  43. data/lib/polars/version.rb +2 -1
  44. data/lib/polars/when.rb +1 -0
  45. data/lib/polars/when_then.rb +1 -0
  46. data/lib/polars.rb +12 -10
  47. metadata +15 -2
@@ -1,34 +1,252 @@
1
1
  module Polars
2
+ # Namespace for string related expressions.
2
3
  class StringExpr
4
+ # @private
3
5
  attr_accessor :_rbexpr
4
6
 
7
+ # @private
5
8
  def initialize(expr)
6
9
  self._rbexpr = expr._rbexpr
7
10
  end
8
11
 
9
- # def strptime
10
- # end
12
+ # Parse a Utf8 expression to a Date/Datetime/Time type.
13
+ #
14
+ # @param datatype [Symbol]
15
+ # `:date`, `:datetime`, or `:time`.
16
+ # @param fmt [String]
17
+ # Format to use, refer to the
18
+ # [chrono strftime documentation](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
19
+ # for specification. Example: `"%y-%m-%d"`.
20
+ # @param strict [Boolean]
21
+ # Raise an error if any conversion fails.
22
+ # @param exact [Boolean]
23
+ # - If true, require an exact format match.
24
+ # - If false, allow the format to match anywhere in the target string.
25
+ #
26
+ # @return [Expr]
27
+ #
28
+ # @note
29
+ # When parsing a Datetime the column precision will be inferred from
30
+ # the format string, if given, eg: "%F %T%.3f" => Datetime("ms"). If
31
+ # no fractional second component is found then the default is "us".
32
+ #
33
+ # @example
34
+ # s = Polars::Series.new(
35
+ # "date",
36
+ # [
37
+ # "2021-04-22",
38
+ # "2022-01-04 00:00:00",
39
+ # "01/31/22",
40
+ # "Sun Jul 8 00:34:60 2001"
41
+ # ]
42
+ # )
43
+ # s.to_frame.with_column(
44
+ # Polars.col("date")
45
+ # .str.strptime(:date, "%F", strict: false)
46
+ # .fill_null(
47
+ # Polars.col("date").str.strptime(:date, "%F %T", strict: false)
48
+ # )
49
+ # .fill_null(Polars.col("date").str.strptime(:date, "%D", strict: false))
50
+ # .fill_null(Polars.col("date").str.strptime(:date, "%c", strict: false))
51
+ # )
52
+ # # =>
53
+ # # shape: (4, 1)
54
+ # # ┌────────────┐
55
+ # # │ date │
56
+ # # │ --- │
57
+ # # │ date │
58
+ # # ╞════════════╡
59
+ # # │ 2021-04-22 │
60
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
61
+ # # │ 2022-01-04 │
62
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
63
+ # # │ 2022-01-31 │
64
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌┤
65
+ # # │ 2001-07-08 │
66
+ # # └────────────┘
67
+ def strptime(datatype, fmt = nil, strict: true, exact: true)
68
+ if !Utils.is_polars_dtype(datatype)
69
+ raise ArgumentError, "expected: {DataType} got: #{datatype}"
70
+ end
71
+
72
+ if datatype == :date
73
+ Utils.wrap_expr(_rbexpr.str_parse_date(fmt, strict, exact))
74
+ elsif datatype == :datetime
75
+ # TODO fix
76
+ tu = nil # datatype.tu
77
+ dtcol = Utils.wrap_expr(_rbexpr.str_parse_datetime(fmt, strict, exact))
78
+ if tu.nil?
79
+ dtcol
80
+ else
81
+ dtcol.dt.cast_time_unit(tu)
82
+ end
83
+ elsif datatype == :time
84
+ Utils.wrap_expr(_rbexpr.str_parse_time(fmt, strict, exact))
85
+ else
86
+ raise ArgumentError, "dtype should be of type :date, :datetime, or :time"
87
+ end
88
+ end
11
89
 
90
+ # Get length of the strings as `:u32` (as number of bytes).
91
+ #
92
+ # @return [Expr]
93
+ #
94
+ # @note
95
+ # The returned lengths are equal to the number of bytes in the UTF8 string. If you
96
+ # need the length in terms of the number of characters, use `n_chars` instead.
97
+ #
98
+ # @example
99
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
100
+ # [
101
+ # Polars.col("s").str.lengths.alias("length"),
102
+ # Polars.col("s").str.n_chars.alias("nchars")
103
+ # ]
104
+ # )
105
+ # df
106
+ # # =>
107
+ # # shape: (4, 3)
108
+ # # ┌──────┬────────┬────────┐
109
+ # # │ s ┆ length ┆ nchars │
110
+ # # │ --- ┆ --- ┆ --- │
111
+ # # │ str ┆ u32 ┆ u32 │
112
+ # # ╞══════╪════════╪════════╡
113
+ # # │ Café ┆ 5 ┆ 4 │
114
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
115
+ # # │ null ┆ null ┆ null │
116
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
117
+ # # │ 345 ┆ 3 ┆ 3 │
118
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
119
+ # # │ 東京 ┆ 6 ┆ 2 │
120
+ # # └──────┴────────┴────────┘
12
121
  def lengths
13
122
  Utils.wrap_expr(_rbexpr.str_lengths)
14
123
  end
15
124
 
125
+ # Get length of the strings as `:u32` (as number of chars).
126
+ #
127
+ # @return [Expr]
128
+ #
129
+ # @note
130
+ # If you know that you are working with ASCII text, `lengths` will be
131
+ # equivalent, and faster (returns length in terms of the number of bytes).
132
+ #
133
+ # @example
134
+ # df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
135
+ # [
136
+ # Polars.col("s").str.lengths.alias("length"),
137
+ # Polars.col("s").str.n_chars.alias("nchars")
138
+ # ]
139
+ # )
140
+ # df
141
+ # # =>
142
+ # # shape: (4, 3)
143
+ # # ┌──────┬────────┬────────┐
144
+ # # │ s ┆ length ┆ nchars │
145
+ # # │ --- ┆ --- ┆ --- │
146
+ # # │ str ┆ u32 ┆ u32 │
147
+ # # ╞══════╪════════╪════════╡
148
+ # # │ Café ┆ 5 ┆ 4 │
149
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
150
+ # # │ null ┆ null ┆ null │
151
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
152
+ # # │ 345 ┆ 3 ┆ 3 │
153
+ # # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
154
+ # # │ 東京 ┆ 6 ┆ 2 │
155
+ # # └──────┴────────┴────────┘
16
156
  def n_chars
17
157
  Utils.wrap_expr(_rbexpr.str_n_chars)
18
158
  end
19
159
 
160
+ # Vertically concat the values in the Series to a single string value.
161
+ #
162
+ # @param delimiter [String]
163
+ # The delimiter to insert between consecutive string values.
164
+ #
165
+ # @return [Expr]
166
+ #
167
+ # @example
168
+ # df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
169
+ # df.select(Polars.col("foo").str.concat("-"))
170
+ # # =>
171
+ # # shape: (1, 1)
172
+ # # ┌──────────┐
173
+ # # │ foo │
174
+ # # │ --- │
175
+ # # │ str │
176
+ # # ╞══════════╡
177
+ # # │ 1-null-2 │
178
+ # # └──────────┘
20
179
  def concat(delimiter = "-")
21
180
  Utils.wrap_expr(_rbexpr.str_concat(delimiter))
22
181
  end
23
182
 
183
+ # Transform to uppercase variant.
184
+ #
185
+ # @return [Expr]
186
+ #
187
+ # @example
188
+ # df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
189
+ # df.select(Polars.col("foo").str.to_uppercase)
190
+ # # =>
191
+ # # shape: (2, 1)
192
+ # # ┌─────┐
193
+ # # │ foo │
194
+ # # │ --- │
195
+ # # │ str │
196
+ # # ╞═════╡
197
+ # # │ CAT │
198
+ # # ├╌╌╌╌╌┤
199
+ # # │ DOG │
200
+ # # └─────┘
24
201
  def to_uppercase
25
202
  Utils.wrap_expr(_rbexpr.str_to_uppercase)
26
203
  end
27
204
 
205
+ # Transform to lowercase variant.
206
+ #
207
+ # @return [Expr]
208
+ #
209
+ # @example
210
+ # df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
211
+ # df.select(Polars.col("foo").str.to_lowercase)
212
+ # # =>
213
+ # # shape: (2, 1)
214
+ # # ┌─────┐
215
+ # # │ foo │
216
+ # # │ --- │
217
+ # # │ str │
218
+ # # ╞═════╡
219
+ # # │ cat │
220
+ # # ├╌╌╌╌╌┤
221
+ # # │ dog │
222
+ # # └─────┘
28
223
  def to_lowercase
29
224
  Utils.wrap_expr(_rbexpr.str_to_lowercase)
30
225
  end
31
226
 
227
+ # Remove leading and trailing whitespace.
228
+ #
229
+ # @param matches [String, nil]
230
+ # An optional single character that should be trimmed.
231
+ #
232
+ # @return [Expr]
233
+ #
234
+ # @example
235
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
236
+ # df.select(Polars.col("foo").str.strip)
237
+ # # =>
238
+ # # shape: (3, 1)
239
+ # # ┌───────┐
240
+ # # │ foo │
241
+ # # │ --- │
242
+ # # │ str │
243
+ # # ╞═══════╡
244
+ # # │ lead │
245
+ # # ├╌╌╌╌╌╌╌┤
246
+ # # │ trail │
247
+ # # ├╌╌╌╌╌╌╌┤
248
+ # # │ both │
249
+ # # └───────┘
32
250
  def strip(matches = nil)
33
251
  if !matches.nil? && matches.length > 1
34
252
  raise ArgumentError, "matches should contain a single character"
@@ -36,6 +254,29 @@ module Polars
36
254
  Utils.wrap_expr(_rbexpr.str_strip(matches))
37
255
  end
38
256
 
257
+ # Remove leading whitespace.
258
+ #
259
+ # @param matches [String, nil]
260
+ # An optional single character that should be trimmed.
261
+ #
262
+ # @return [Expr]
263
+ #
264
+ # @example
265
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
266
+ # df.select(Polars.col("foo").str.lstrip)
267
+ # # =>
268
+ # # shape: (3, 1)
269
+ # # ┌────────┐
270
+ # # │ foo │
271
+ # # │ --- │
272
+ # # │ str │
273
+ # # ╞════════╡
274
+ # # │ lead │
275
+ # # ├╌╌╌╌╌╌╌╌┤
276
+ # # │ trail │
277
+ # # ├╌╌╌╌╌╌╌╌┤
278
+ # # │ both │
279
+ # # └────────┘
39
280
  def lstrip(matches = nil)
40
281
  if !matches.nil? && matches.length > 1
41
282
  raise ArgumentError, "matches should contain a single character"
@@ -43,6 +284,29 @@ module Polars
43
284
  Utils.wrap_expr(_rbexpr.str_lstrip(matches))
44
285
  end
45
286
 
287
+ # Remove trailing whitespace.
288
+ #
289
+ # @param matches [String, nil]
290
+ # An optional single character that should be trimmed.
291
+ #
292
+ # @return [Expr]
293
+ #
294
+ # @example
295
+ # df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
296
+ # df.select(Polars.col("foo").str.rstrip)
297
+ # # =>
298
+ # # shape: (3, 1)
299
+ # # ┌───────┐
300
+ # # │ foo │
301
+ # # │ --- │
302
+ # # │ str │
303
+ # # ╞═══════╡
304
+ # # │ lead │
305
+ # # ├╌╌╌╌╌╌╌┤
306
+ # # │ trail │
307
+ # # ├╌╌╌╌╌╌╌┤
308
+ # # │ both │
309
+ # # └───────┘
46
310
  def rstrip(matches = nil)
47
311
  if !matches.nil? && matches.length > 1
48
312
  raise ArgumentError, "matches should contain a single character"
@@ -50,51 +314,474 @@ module Polars
50
314
  Utils.wrap_expr(_rbexpr.str_rstrip(matches))
51
315
  end
52
316
 
317
+ # Fills the string with zeroes.
318
+ #
319
+ # Return a copy of the string left filled with ASCII '0' digits to make a string
320
+ # of length width.
321
+ #
322
+ # A leading sign prefix ('+'/'-') is handled by inserting the padding after the
323
+ # sign character rather than before. The original string is returned if width is
324
+ # less than or equal to `s.length`.
325
+ #
326
+ # @param alignment [Integer]
327
+ # Fill the value up to this length
328
+ #
329
+ # @return [Expr]
330
+ #
331
+ # @example
332
+ # df = Polars::DataFrame.new(
333
+ # {
334
+ # "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
335
+ # }
336
+ # )
337
+ # df.with_column(Polars.col("num").cast(String).str.zfill(5))
338
+ # # =>
339
+ # # shape: (11, 1)
340
+ # # ┌─────────┐
341
+ # # │ num │
342
+ # # │ --- │
343
+ # # │ str │
344
+ # # ╞═════════╡
345
+ # # │ -0010 │
346
+ # # ├╌╌╌╌╌╌╌╌╌┤
347
+ # # │ -0001 │
348
+ # # ├╌╌╌╌╌╌╌╌╌┤
349
+ # # │ 00000 │
350
+ # # ├╌╌╌╌╌╌╌╌╌┤
351
+ # # │ 00001 │
352
+ # # ├╌╌╌╌╌╌╌╌╌┤
353
+ # # │ ... │
354
+ # # ├╌╌╌╌╌╌╌╌╌┤
355
+ # # │ 10000 │
356
+ # # ├╌╌╌╌╌╌╌╌╌┤
357
+ # # │ 100000 │
358
+ # # ├╌╌╌╌╌╌╌╌╌┤
359
+ # # │ 1000000 │
360
+ # # ├╌╌╌╌╌╌╌╌╌┤
361
+ # # │ null │
362
+ # # └─────────┘
53
363
  def zfill(alignment)
54
364
  Utils.wrap_expr(_rbexpr.str_zfill(alignment))
55
365
  end
56
366
 
367
+ # Return the string left justified in a string of length `width`.
368
+ #
369
+ # Padding is done using the specified `fillchar`.
370
+ # The original string is returned if `width` is less than or equal to
371
+ # `s.length`.
372
+ #
373
+ # @param width [Integer]
374
+ # Justify left to this length.
375
+ # @param fillchar [String]
376
+ # Fill with this ASCII character.
377
+ #
378
+ # @return [Expr]
379
+ #
380
+ # @example
381
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
382
+ # df.select(Polars.col("a").str.ljust(8, "*"))
383
+ # # =>
384
+ # # shape: (4, 1)
385
+ # # ┌──────────────┐
386
+ # # │ a │
387
+ # # │ --- │
388
+ # # │ str │
389
+ # # ╞══════════════╡
390
+ # # │ cow***** │
391
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
392
+ # # │ monkey** │
393
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
394
+ # # │ null │
395
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
396
+ # # │ hippopotamus │
397
+ # # └──────────────┘
57
398
  def ljust(width, fillchar = " ")
58
399
  Utils.wrap_expr(_rbexpr.str_ljust(width, fillchar))
59
400
  end
60
401
 
402
+ # Return the string right justified in a string of length `width`.
403
+ #
404
+ # Padding is done using the specified `fillchar`.
405
+ # The original string is returned if `width` is less than or equal to
406
+ # `s.length`.
407
+ #
408
+ # @param width [Integer]
409
+ # Justify right to this length.
410
+ # @param fillchar [String]
411
+ # Fill with this ASCII character.
412
+ #
413
+ # @return [Expr]
414
+ #
415
+ # @example
416
+ # df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
417
+ # df.select(Polars.col("a").str.rjust(8, "*"))
418
+ # # =>
419
+ # # shape: (4, 1)
420
+ # # ┌──────────────┐
421
+ # # │ a │
422
+ # # │ --- │
423
+ # # │ str │
424
+ # # ╞══════════════╡
425
+ # # │ *****cow │
426
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
427
+ # # │ **monkey │
428
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
429
+ # # │ null │
430
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
431
+ # # │ hippopotamus │
432
+ # # └──────────────┘
61
433
  def rjust(width, fillchar = " ")
62
434
  Utils.wrap_expr(_rbexpr.str_rjust(width, fillchar))
63
435
  end
64
436
 
437
+ # Check if string contains a substring that matches a regex.
438
+ #
439
+ # @param pattern [String]
440
+ # A valid regex pattern.
441
+ # @param literal [Boolean]
442
+ # Treat pattern as a literal string.
443
+ #
444
+ # @return [Expr]
445
+ #
446
+ # @example
447
+ # df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
448
+ # df.select(
449
+ # [
450
+ # Polars.col("a"),
451
+ # Polars.col("a").str.contains("cat|bit").alias("regex"),
452
+ # Polars.col("a").str.contains("rab$", literal: true).alias("literal")
453
+ # ]
454
+ # )
455
+ # # =>
456
+ # # shape: (4, 3)
457
+ # # ┌─────────────┬───────┬─────────┐
458
+ # # │ a ┆ regex ┆ literal │
459
+ # # │ --- ┆ --- ┆ --- │
460
+ # # │ str ┆ bool ┆ bool │
461
+ # # ╞═════════════╪═══════╪═════════╡
462
+ # # │ Crab ┆ false ┆ false │
463
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
464
+ # # │ cat and dog ┆ true ┆ false │
465
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
466
+ # # │ rab$bit ┆ true ┆ true │
467
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
468
+ # # │ null ┆ null ┆ null │
469
+ # # └─────────────┴───────┴─────────┘
65
470
  def contains(pattern, literal: false)
66
471
  Utils.wrap_expr(_rbexpr.str_contains(pattern, literal))
67
472
  end
68
473
 
474
+ # Check if string values end with a substring.
475
+ #
476
+ # @param sub [String]
477
+ # Suffix substring.
478
+ #
479
+ # @return [Expr]
480
+ #
481
+ # @example
482
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
483
+ # df.with_column(
484
+ # Polars.col("fruits").str.ends_with("go").alias("has_suffix")
485
+ # )
486
+ # # =>
487
+ # # shape: (3, 2)
488
+ # # ┌────────┬────────────┐
489
+ # # │ fruits ┆ has_suffix │
490
+ # # │ --- ┆ --- │
491
+ # # │ str ┆ bool │
492
+ # # ╞════════╪════════════╡
493
+ # # │ apple ┆ false │
494
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
495
+ # # │ mango ┆ true │
496
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
497
+ # # │ null ┆ null │
498
+ # # └────────┴────────────┘
499
+ #
500
+ # @example Using `ends_with` as a filter condition:
501
+ # df.filter(Polars.col("fruits").str.ends_with("go"))
502
+ # # =>
503
+ # # shape: (1, 1)
504
+ # # ┌────────┐
505
+ # # │ fruits │
506
+ # # │ --- │
507
+ # # │ str │
508
+ # # ╞════════╡
509
+ # # │ mango │
510
+ # # └────────┘
69
511
  def ends_with(sub)
70
512
  Utils.wrap_expr(_rbexpr.str_ends_with(sub))
71
513
  end
72
514
 
515
+ # Check if string values start with a substring.
516
+ #
517
+ # @param sub [String]
518
+ # Prefix substring.
519
+ #
520
+ # @return [Expr]
521
+ #
522
+ # @example
523
+ # df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
524
+ # df.with_column(
525
+ # Polars.col("fruits").str.starts_with("app").alias("has_prefix")
526
+ # )
527
+ # # =>
528
+ # # shape: (3, 2)
529
+ # # ┌────────┬────────────┐
530
+ # # │ fruits ┆ has_prefix │
531
+ # # │ --- ┆ --- │
532
+ # # │ str ┆ bool │
533
+ # # ╞════════╪════════════╡
534
+ # # │ apple ┆ true │
535
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
536
+ # # │ mango ┆ false │
537
+ # # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
538
+ # # │ null ┆ null │
539
+ # # └────────┴────────────┘
540
+ #
541
+ # @example Using `starts_with` as a filter condition:
542
+ # df.filter(Polars.col("fruits").str.starts_with("app"))
543
+ # # =>
544
+ # # shape: (1, 1)
545
+ # # ┌────────┐
546
+ # # │ fruits │
547
+ # # │ --- │
548
+ # # │ str │
549
+ # # ╞════════╡
550
+ # # │ apple │
551
+ # # └────────┘
73
552
  def starts_with(sub)
74
553
  Utils.wrap_expr(_rbexpr.str_starts_with(sub))
75
554
  end
76
555
 
77
- # def json_path_match
78
- # end
556
+ # Extract the first match of json string with provided JSONPath expression.
557
+ #
558
+ # Throws an error if an invalid JSON string is encountered.
559
+ # All return values will be cast to Utf8 regardless of the original value.
560
+ #
561
+ # Documentation on JSONPath standard can be found
562
+ # [here](https://goessner.net/articles/JsonPath/).
563
+ #
564
+ # @param json_path [String]
565
+ # A valid JSON path query string.
566
+ #
567
+ # @return [Expr]
568
+ #
569
+ # @example
570
+ # df = Polars::DataFrame.new(
571
+ # {"json_val" => ['{"a":"1"}', nil, '{"a":2}', '{"a":2.1}', '{"a":true}']}
572
+ # )
573
+ # df.select(Polars.col("json_val").str.json_path_match("$.a"))
574
+ # # =>
575
+ # # shape: (5, 1)
576
+ # # ┌──────────┐
577
+ # # │ json_val │
578
+ # # │ --- │
579
+ # # │ str │
580
+ # # ╞══════════╡
581
+ # # │ 1 │
582
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
583
+ # # │ null │
584
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
585
+ # # │ 2 │
586
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
587
+ # # │ 2.1 │
588
+ # # ├╌╌╌╌╌╌╌╌╌╌┤
589
+ # # │ true │
590
+ # # └──────────┘
591
+ def json_path_match(json_path)
592
+ Utils.wrap_expr(_rbexpr.str_json_path_match(json_path))
593
+ end
79
594
 
80
- # def decode
81
- # end
595
+ # Decode a value using the provided encoding.
596
+ #
597
+ # @param encoding ["hex", "base64"]
598
+ # The encoding to use.
599
+ # @param strict [Boolean]
600
+ # How to handle invalid inputs:
601
+ #
602
+ # - `true`: An error will be thrown if unable to decode a value.
603
+ # - `false`: Unhandled values will be replaced with `nil`.
604
+ #
605
+ # @return [Expr]
606
+ #
607
+ # @example
608
+ # df = Polars::DataFrame.new({"encoded" => ["666f6f", "626172", nil]})
609
+ # df.select(Polars.col("encoded").str.decode("hex"))
610
+ # # =>
611
+ # # shape: (3, 1)
612
+ # # ┌─────────┐
613
+ # # │ encoded │
614
+ # # │ --- │
615
+ # # │ str │
616
+ # # ╞═════════╡
617
+ # # │ foo │
618
+ # # ├╌╌╌╌╌╌╌╌╌┤
619
+ # # │ bar │
620
+ # # ├╌╌╌╌╌╌╌╌╌┤
621
+ # # │ null │
622
+ # # └─────────┘
623
+ def decode(encoding, strict: false)
624
+ if encoding == "hex"
625
+ Utils.wrap_expr(_rbexpr.str_hex_decode(strict))
626
+ elsif encoding == "base64"
627
+ Utils.wrap_expr(_rbexpr.str_base64_decode(strict))
628
+ else
629
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
630
+ end
631
+ end
82
632
 
83
- # def encode
84
- # end
633
+ # Encode a value using the provided encoding.
634
+ #
635
+ # @param encoding ["hex", "base64"]
636
+ # The encoding to use.
637
+ #
638
+ # @return [Expr]
639
+ #
640
+ # @example
641
+ # df = Polars::DataFrame.new({"strings" => ["foo", "bar", nil]})
642
+ # df.select(Polars.col("strings").str.encode("hex"))
643
+ # # =>
644
+ # # shape: (3, 1)
645
+ # # ┌─────────┐
646
+ # # │ strings │
647
+ # # │ --- │
648
+ # # │ str │
649
+ # # ╞═════════╡
650
+ # # │ 666f6f │
651
+ # # ├╌╌╌╌╌╌╌╌╌┤
652
+ # # │ 626172 │
653
+ # # ├╌╌╌╌╌╌╌╌╌┤
654
+ # # │ null │
655
+ # # └─────────┘
656
+ def encode(encoding)
657
+ if encoding == "hex"
658
+ Utils.wrap_expr(_rbexpr.str_hex_encode)
659
+ elsif encoding == "base64"
660
+ Utils.wrap_expr(_rbexpr.str_base64_encode)
661
+ else
662
+ raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
663
+ end
664
+ end
85
665
 
666
+ # Extract the target capture group from provided patterns.
667
+ #
668
+ # @param pattern [String]
669
+ # A valid regex pattern
670
+ # @param group_index [Integer]
671
+ # Index of the targeted capture group.
672
+ # Group 0 means the whole pattern; the first group begins at index 1.
673
+ # Defaults to the first capture group.
674
+ #
675
+ # @return [Expr]
676
+ #
677
+ # @example
678
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
679
+ # df.select(
680
+ # [
681
+ # Polars.col("foo").str.extract('(\d+)')
682
+ # ]
683
+ # )
684
+ # # =>
685
+ # # shape: (2, 1)
686
+ # # ┌─────┐
687
+ # # │ foo │
688
+ # # │ --- │
689
+ # # │ str │
690
+ # # ╞═════╡
691
+ # # │ 123 │
692
+ # # ├╌╌╌╌╌┤
693
+ # # │ 678 │
694
+ # # └─────┘
86
695
  def extract(pattern, group_index: 1)
87
696
  Utils.wrap_expr(_rbexpr.str_extract(pattern, group_index))
88
697
  end
89
698
 
699
+ # Extracts all matches for the given regex pattern.
700
+ #
701
+ # Extracts each successive non-overlapping regex match in an individual string as
702
+ # an array.
703
+ #
704
+ # @param pattern [String]
705
+ # A valid regex pattern
706
+ #
707
+ # @return [Expr]
708
+ #
709
+ # @example
710
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
711
+ # df.select(
712
+ # [
713
+ # Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
714
+ # ]
715
+ # )
716
+ # # =>
717
+ # # shape: (2, 1)
718
+ # # ┌────────────────┐
719
+ # # │ extracted_nrs │
720
+ # # │ --- │
721
+ # # │ list[str] │
722
+ # # ╞════════════════╡
723
+ # # │ ["123", "45"] │
724
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
725
+ # # │ ["678", "910"] │
726
+ # # └────────────────┘
90
727
  def extract_all(pattern)
91
728
  Utils.wrap_expr(_rbexpr.str_extract_all(pattern))
92
729
  end
93
730
 
731
+ # Count all successive non-overlapping regex matches.
732
+ #
733
+ # @param pattern [String]
734
+ # A valid regex pattern
735
+ #
736
+ # @return [Expr]
737
+ #
738
+ # @example
739
+ # df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
740
+ # df.select(
741
+ # [
742
+ # Polars.col("foo").str.count_match('\d').alias("count_digits")
743
+ # ]
744
+ # )
745
+ # # =>
746
+ # # shape: (2, 1)
747
+ # # ┌──────────────┐
748
+ # # │ count_digits │
749
+ # # │ --- │
750
+ # # │ u32 │
751
+ # # ╞══════════════╡
752
+ # # │ 5 │
753
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
754
+ # # │ 6 │
755
+ # # └──────────────┘
94
756
  def count_match(pattern)
95
757
  Utils.wrap_expr(_rbexpr.count_match(pattern))
96
758
  end
97
759
 
760
+ # Split the string by a substring.
761
+ #
762
+ # @param by [String]
763
+ # Substring to split by.
764
+ # @param inclusive [Boolean]
765
+ # If true, include the split character/string in the results.
766
+ #
767
+ # @return [Expr]
768
+ #
769
+ # @example
770
+ # df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
771
+ # df.select(Polars.col("s").str.split(" "))
772
+ # # =>
773
+ # # shape: (3, 1)
774
+ # # ┌───────────────────────┐
775
+ # # │ s │
776
+ # # │ --- │
777
+ # # │ list[str] │
778
+ # # ╞═══════════════════════╡
779
+ # # │ ["foo", "bar"] │
780
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
781
+ # # │ ["foo-bar"] │
782
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
783
+ # # │ ["foo", "bar", "baz"] │
784
+ # # └───────────────────────┘
98
785
  def split(by, inclusive: false)
99
786
  if inclusive
100
787
  Utils.wrap_expr(_rbexpr.str_split_inclusive(by))
@@ -103,6 +790,43 @@ module Polars
103
790
  end
104
791
  end
105
792
 
793
+ # Split the string by a substring using `n` splits.
794
+ #
795
+ # Results in a struct of `n+1` fields.
796
+ #
797
+ # If it cannot make `n` splits, the remaining field elements will be null.
798
+ #
799
+ # @param by [String]
800
+ # Substring to split by.
801
+ # @param n [Integer]
802
+ # Number of splits to make.
803
+ # @param inclusive [Boolean]
804
+ # If true, include the split character/string in the results.
805
+ #
806
+ # @return [Expr]
807
+ #
808
+ # @example
809
+ # df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
810
+ # df.select(
811
+ # [
812
+ # Polars.col("x").str.split_exact("_", 1).alias("fields")
813
+ # ]
814
+ # )
815
+ # # =>
816
+ # # shape: (4, 1)
817
+ # # ┌─────────────┐
818
+ # # │ fields │
819
+ # # │ --- │
820
+ # # │ struct[2] │
821
+ # # ╞═════════════╡
822
+ # # │ {"a","1"} │
823
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
824
+ # # │ {null,null} │
825
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
826
+ # # │ {"c",null} │
827
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
828
+ # # │ {"d","4"} │
829
+ # # └─────────────┘
106
830
  def split_exact(by, n, inclusive: false)
107
831
  if inclusive
108
832
  Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(by, n))
@@ -111,22 +835,135 @@ module Polars
111
835
  end
112
836
  end
113
837
 
838
+ # Split the string by a substring, restricted to returning at most `n` items.
839
+ #
840
+ # If the number of possible splits is less than `n-1`, the remaining field
841
+ # elements will be null. If the number of possible splits is `n-1` or greater,
842
+ # the last (nth) substring will contain the remainder of the string.
843
+ #
844
+ # @param by [String]
845
+ # Substring to split by.
846
+ # @param n [Integer]
847
+ # Max number of items to return.
848
+ #
849
+ # @return [Expr]
850
+ #
851
+ # @example
852
+ # df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
853
+ # df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
854
+ # # =>
855
+ # # shape: (4, 1)
856
+ # # ┌───────────────────┐
857
+ # # │ fields │
858
+ # # │ --- │
859
+ # # │ struct[2] │
860
+ # # ╞═══════════════════╡
861
+ # # │ {"foo","bar"} │
862
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
863
+ # # │ {null,null} │
864
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
865
+ # # │ {"foo-bar",null} │
866
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
867
+ # # │ {"foo","bar baz"} │
868
+ # # └───────────────────┘
114
869
  def splitn(by, n)
115
870
  Utils.wrap_expr(_rbexpr.str_splitn(by, n))
116
871
  end
117
872
 
118
- def replace(pattern, literal: false)
873
+ # Replace first matching regex/literal substring with a new string value.
874
+ #
875
+ # @param pattern [String]
876
+ # Regex pattern.
877
+ # @param value [String]
878
+ # Replacement string.
879
+ # @param literal [Boolean]
880
+ # Treat pattern as a literal string.
881
+ #
882
+ # @return [Expr]
883
+ #
884
+ # @example
885
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
886
+ # df.with_column(
887
+ # Polars.col("text").str.replace('abc\b', "ABC")
888
+ # )
889
+ # # =>
890
+ # # shape: (2, 2)
891
+ # # ┌─────┬────────┐
892
+ # # │ id ┆ text │
893
+ # # │ --- ┆ --- │
894
+ # # │ i64 ┆ str │
895
+ # # ╞═════╪════════╡
896
+ # # │ 1 ┆ 123ABC │
897
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
898
+ # # │ 2 ┆ abc456 │
899
+ # # └─────┴────────┘
900
+ def replace(pattern, value, literal: false)
119
901
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
120
902
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
121
903
  Utils.wrap_expr(_rbexpr.str_replace(pattern._rbexpr, value._rbexpr, literal))
122
904
  end
123
905
 
124
- def replace_all(pattern, literal: false)
906
+ # Replace all matching regex/literal substrings with a new string value.
907
+ #
908
+ # @param pattern [String]
909
+ # Regex pattern.
910
+ # @param value [String]
911
+ # Replacement string.
912
+ # @param literal [Boolean]
913
+ # Treat pattern as a literal string.
914
+ #
915
+ # @return [Expr]
916
+ #
917
+ # @example
918
+ # df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
919
+ # df.with_column(Polars.col("text").str.replace_all("a", "-"))
920
+ # # =>
921
+ # # shape: (2, 2)
922
+ # # ┌─────┬─────────┐
923
+ # # │ id ┆ text │
924
+ # # │ --- ┆ --- │
925
+ # # │ i64 ┆ str │
926
+ # # ╞═════╪═════════╡
927
+ # # │ 1 ┆ -bc-bc │
928
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
929
+ # # │ 2 ┆ 123-123 │
930
+ # # └─────┴─────────┘
931
+ def replace_all(pattern, value, literal: false)
125
932
  pattern = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
126
933
  value = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
127
934
  Utils.wrap_expr(_rbexpr.str_replace_all(pattern._rbexpr, value._rbexpr, literal))
128
935
  end
129
936
 
937
+ # Create subslices of the string values of a Utf8 Series.
938
+ #
939
+ # @param offset [Integer]
940
+ # Start index. Negative indexing is supported.
941
+ # @param length [Integer]
942
+ # Length of the slice. If set to `nil` (default), the slice is taken to the
943
+ # end of the string.
944
+ #
945
+ # @return [Expr]
946
+ #
947
+ # @example
948
+ # df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
949
+ # df.with_column(
950
+ # Polars.col("s").str.slice(-3).alias("s_sliced")
951
+ # )
952
+ # # =>
953
+ # # shape: (4, 2)
954
+ # # ┌─────────────┬──────────┐
955
+ # # │ s ┆ s_sliced │
956
+ # # │ --- ┆ --- │
957
+ # # │ str ┆ str │
958
+ # # ╞═════════════╪══════════╡
959
+ # # │ pear ┆ ear │
960
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
961
+ # # │ null ┆ null │
962
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
963
+ # # │ papaya ┆ aya │
964
+ # # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
965
+ # # │ dragonfruit ┆ uit │
966
+ # # └─────────────┴──────────┘
130
967
  def slice(offset, length = nil)
131
968
  Utils.wrap_expr(_rbexpr.str_slice(offset, length))
132
969
  end