polars-df 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +3 -0
- data/CHANGELOG.md +8 -0
- data/Cargo.lock +2 -1
- data/README.md +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/batched_csv.rs +120 -0
- data/ext/polars/src/conversion.rs +139 -6
- data/ext/polars/src/dataframe.rs +360 -15
- data/ext/polars/src/error.rs +9 -0
- data/ext/polars/src/file.rs +8 -7
- data/ext/polars/src/lazy/apply.rs +7 -0
- data/ext/polars/src/lazy/dataframe.rs +135 -3
- data/ext/polars/src/lazy/dsl.rs +97 -2
- data/ext/polars/src/lazy/meta.rs +1 -1
- data/ext/polars/src/lazy/mod.rs +1 -0
- data/ext/polars/src/lib.rs +227 -12
- data/ext/polars/src/series.rs +190 -38
- data/ext/polars/src/set.rs +91 -0
- data/ext/polars/src/utils.rs +19 -0
- data/lib/polars/batched_csv_reader.rb +96 -0
- data/lib/polars/cat_expr.rb +39 -0
- data/lib/polars/data_frame.rb +2813 -100
- data/lib/polars/date_time_expr.rb +1282 -7
- data/lib/polars/exceptions.rb +20 -0
- data/lib/polars/expr.rb +631 -11
- data/lib/polars/expr_dispatch.rb +14 -0
- data/lib/polars/functions.rb +219 -0
- data/lib/polars/group_by.rb +517 -0
- data/lib/polars/io.rb +763 -4
- data/lib/polars/lazy_frame.rb +1415 -67
- data/lib/polars/lazy_functions.rb +430 -9
- data/lib/polars/lazy_group_by.rb +79 -0
- data/lib/polars/list_expr.rb +5 -0
- data/lib/polars/meta_expr.rb +21 -0
- data/lib/polars/series.rb +2244 -192
- data/lib/polars/slice.rb +104 -0
- data/lib/polars/string_expr.rb +663 -2
- data/lib/polars/struct_expr.rb +73 -0
- data/lib/polars/utils.rb +76 -3
- data/lib/polars/version.rb +2 -1
- data/lib/polars/when.rb +1 -0
- data/lib/polars/when_then.rb +1 -0
- data/lib/polars.rb +8 -2
- metadata +12 -2
data/lib/polars/string_expr.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module Polars
|
2
|
+
# Namespace for string related expressions.
|
2
3
|
class StringExpr
|
4
|
+
# @private
|
3
5
|
attr_accessor :_rbexpr
|
4
6
|
|
7
|
+
# @private
|
5
8
|
def initialize(expr)
  # Take the _rbexpr held by the given expression and store it on this namespace object.
  inner = expr._rbexpr
  self._rbexpr = inner
end
|
@@ -9,26 +12,166 @@ module Polars
|
|
9
12
|
# def strptime
|
10
13
|
# end
|
11
14
|
|
15
|
+
# Get length of the strings as `:u32` (as number of bytes).
|
16
|
+
#
|
17
|
+
# @return [Expr]
|
18
|
+
#
|
19
|
+
# @note
|
20
|
+
# The returned lengths are equal to the number of bytes in the UTF8 string. If you
|
21
|
+
# need the length in terms of the number of characters, use `n_chars` instead.
|
22
|
+
#
|
23
|
+
# @example
|
24
|
+
# df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
|
25
|
+
# [
|
26
|
+
# Polars.col("s").str.lengths.alias("length"),
|
27
|
+
# Polars.col("s").str.n_chars.alias("nchars")
|
28
|
+
# ]
|
29
|
+
# )
|
30
|
+
# df
|
31
|
+
# # =>
|
32
|
+
# # shape: (4, 3)
|
33
|
+
# # ┌──────┬────────┬────────┐
|
34
|
+
# # │ s ┆ length ┆ nchars │
|
35
|
+
# # │ --- ┆ --- ┆ --- │
|
36
|
+
# # │ str ┆ u32 ┆ u32 │
|
37
|
+
# # ╞══════╪════════╪════════╡
|
38
|
+
# # │ Café ┆ 5 ┆ 4 │
|
39
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
40
|
+
# # │ null ┆ null ┆ null │
|
41
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
42
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
43
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
44
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
45
|
+
# # └──────┴────────┴────────┘
|
12
46
|
def lengths
  # Build the underlying expression first, then hand it to Utils.wrap_expr.
  byte_lengths = _rbexpr.str_lengths
  Utils.wrap_expr(byte_lengths)
end
|
15
49
|
|
50
|
+
# Get length of the strings as `:u32` (as number of chars).
|
51
|
+
#
|
52
|
+
# @return [Expr]
|
53
|
+
#
|
54
|
+
# @note
|
55
|
+
# If you know that you are working with ASCII text, `lengths` will be
|
56
|
+
# equivalent, and faster (returns length in terms of the number of bytes).
|
57
|
+
#
|
58
|
+
# @example
|
59
|
+
# df = Polars::DataFrame.new({"s" => ["Café", nil, "345", "東京"]}).with_columns(
|
60
|
+
# [
|
61
|
+
# Polars.col("s").str.lengths.alias("length"),
|
62
|
+
# Polars.col("s").str.n_chars.alias("nchars")
|
63
|
+
# ]
|
64
|
+
# )
|
65
|
+
# df
|
66
|
+
# # =>
|
67
|
+
# # shape: (4, 3)
|
68
|
+
# # ┌──────┬────────┬────────┐
|
69
|
+
# # │ s ┆ length ┆ nchars │
|
70
|
+
# # │ --- ┆ --- ┆ --- │
|
71
|
+
# # │ str ┆ u32 ┆ u32 │
|
72
|
+
# # ╞══════╪════════╪════════╡
|
73
|
+
# # │ Café ┆ 5 ┆ 4 │
|
74
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
75
|
+
# # │ null ┆ null ┆ null │
|
76
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
77
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
78
|
+
# # ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
79
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
80
|
+
# # └──────┴────────┴────────┘
|
16
81
|
def n_chars
  # Build the underlying expression first, then hand it to Utils.wrap_expr.
  char_counts = _rbexpr.str_n_chars
  Utils.wrap_expr(char_counts)
end
|
19
84
|
|
85
|
+
# Vertically concat the values in the Series to a single string value.
|
86
|
+
#
|
87
|
+
# @param delimiter [String]
|
88
|
+
# The delimiter to insert between consecutive string values.
|
89
|
+
#
|
90
|
+
# @return [Expr]
|
91
|
+
#
|
92
|
+
# @example
|
93
|
+
# df = Polars::DataFrame.new({"foo" => [1, nil, 2]})
|
94
|
+
# df.select(Polars.col("foo").str.concat("-"))
|
95
|
+
# # =>
|
96
|
+
# # shape: (1, 1)
|
97
|
+
# # ┌──────────┐
|
98
|
+
# # │ foo │
|
99
|
+
# # │ --- │
|
100
|
+
# # │ str │
|
101
|
+
# # ╞══════════╡
|
102
|
+
# # │ 1-null-2 │
|
103
|
+
# # └──────────┘
|
20
104
|
def concat(delimiter = "-")
  # The delimiter is forwarded unchanged to str_concat; the result is wrapped.
  joined = _rbexpr.str_concat(delimiter)
  Utils.wrap_expr(joined)
end
|
23
107
|
|
108
|
+
# Transform to uppercase variant.
|
109
|
+
#
|
110
|
+
# @return [Expr]
|
111
|
+
#
|
112
|
+
# @example
|
113
|
+
# df = Polars::DataFrame.new({"foo" => ["cat", "dog"]})
|
114
|
+
# df.select(Polars.col("foo").str.to_uppercase)
|
115
|
+
# # =>
|
116
|
+
# # shape: (2, 1)
|
117
|
+
# # ┌─────┐
|
118
|
+
# # │ foo │
|
119
|
+
# # │ --- │
|
120
|
+
# # │ str │
|
121
|
+
# # ╞═════╡
|
122
|
+
# # │ CAT │
|
123
|
+
# # ├╌╌╌╌╌┤
|
124
|
+
# # │ DOG │
|
125
|
+
# # └─────┘
|
24
126
|
def to_uppercase
  # Build the underlying expression first, then hand it to Utils.wrap_expr.
  upper = _rbexpr.str_to_uppercase
  Utils.wrap_expr(upper)
end
|
27
129
|
|
130
|
+
# Transform to lowercase variant.
|
131
|
+
#
|
132
|
+
# @return [Expr]
|
133
|
+
#
|
134
|
+
# @example
|
135
|
+
# df = Polars::DataFrame.new({"foo" => ["CAT", "DOG"]})
|
136
|
+
# df.select(Polars.col("foo").str.to_lowercase)
|
137
|
+
# # =>
|
138
|
+
# # shape: (2, 1)
|
139
|
+
# # ┌─────┐
|
140
|
+
# # │ foo │
|
141
|
+
# # │ --- │
|
142
|
+
# # │ str │
|
143
|
+
# # ╞═════╡
|
144
|
+
# # │ cat │
|
145
|
+
# # ├╌╌╌╌╌┤
|
146
|
+
# # │ dog │
|
147
|
+
# # └─────┘
|
28
148
|
def to_lowercase
  # Build the underlying expression first, then hand it to Utils.wrap_expr.
  lower = _rbexpr.str_to_lowercase
  Utils.wrap_expr(lower)
end
|
31
151
|
|
152
|
+
# Remove leading and trailing whitespace.
|
153
|
+
#
|
154
|
+
# @param matches [String, nil]
|
155
|
+
# An optional single character that should be trimmed.
|
156
|
+
#
|
157
|
+
# @return [Expr]
|
158
|
+
#
|
159
|
+
# @example
|
160
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
161
|
+
# df.select(Polars.col("foo").str.strip)
|
162
|
+
# # =>
|
163
|
+
# # shape: (3, 1)
|
164
|
+
# # ┌───────┐
|
165
|
+
# # │ foo │
|
166
|
+
# # │ --- │
|
167
|
+
# # │ str │
|
168
|
+
# # ╞═══════╡
|
169
|
+
# # │ lead │
|
170
|
+
# # ├╌╌╌╌╌╌╌┤
|
171
|
+
# # │ trail │
|
172
|
+
# # ├╌╌╌╌╌╌╌┤
|
173
|
+
# # │ both │
|
174
|
+
# # └───────┘
|
32
175
|
def strip(matches = nil)
|
33
176
|
if !matches.nil? && matches.length > 1
|
34
177
|
raise ArgumentError, "matches should contain a single character"
|
@@ -36,6 +179,29 @@ module Polars
|
|
36
179
|
Utils.wrap_expr(_rbexpr.str_strip(matches))
|
37
180
|
end
|
38
181
|
|
182
|
+
# Remove leading whitespace.
|
183
|
+
#
|
184
|
+
# @param matches [String, nil]
|
185
|
+
# An optional single character that should be trimmed.
|
186
|
+
#
|
187
|
+
# @return [Expr]
|
188
|
+
#
|
189
|
+
# @example
|
190
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
191
|
+
# df.select(Polars.col("foo").str.lstrip)
|
192
|
+
# # =>
|
193
|
+
# # shape: (3, 1)
|
194
|
+
# # ┌────────┐
|
195
|
+
# # │ foo │
|
196
|
+
# # │ --- │
|
197
|
+
# # │ str │
|
198
|
+
# # ╞════════╡
|
199
|
+
# # │ lead │
|
200
|
+
# # ├╌╌╌╌╌╌╌╌┤
|
201
|
+
# # │ trail │
|
202
|
+
# # ├╌╌╌╌╌╌╌╌┤
|
203
|
+
# # │ both │
|
204
|
+
# # └────────┘
|
39
205
|
def lstrip(matches = nil)
|
40
206
|
if !matches.nil? && matches.length > 1
|
41
207
|
raise ArgumentError, "matches should contain a single character"
|
@@ -43,6 +209,29 @@ module Polars
|
|
43
209
|
Utils.wrap_expr(_rbexpr.str_lstrip(matches))
|
44
210
|
end
|
45
211
|
|
212
|
+
# Remove trailing whitespace.
|
213
|
+
#
|
214
|
+
# @param matches [String, nil]
|
215
|
+
# An optional single character that should be trimmed.
|
216
|
+
#
|
217
|
+
# @return [Expr]
|
218
|
+
#
|
219
|
+
# @example
|
220
|
+
# df = Polars::DataFrame.new({"foo" => [" lead", "trail ", " both "]})
|
221
|
+
# df.select(Polars.col("foo").str.rstrip)
|
222
|
+
# # =>
|
223
|
+
# # shape: (3, 1)
|
224
|
+
# # ┌───────┐
|
225
|
+
# # │ foo │
|
226
|
+
# # │ --- │
|
227
|
+
# # │ str │
|
228
|
+
# # ╞═══════╡
|
229
|
+
# # │ lead │
|
230
|
+
# # ├╌╌╌╌╌╌╌┤
|
231
|
+
# # │ trail │
|
232
|
+
# # ├╌╌╌╌╌╌╌┤
|
233
|
+
# # │ both │
|
234
|
+
# # └───────┘
|
46
235
|
def rstrip(matches = nil)
|
47
236
|
if !matches.nil? && matches.length > 1
|
48
237
|
raise ArgumentError, "matches should contain a single character"
|
@@ -50,26 +239,241 @@ module Polars
|
|
50
239
|
Utils.wrap_expr(_rbexpr.str_rstrip(matches))
|
51
240
|
end
|
52
241
|
|
242
|
+
# Fills the string with zeroes.
|
243
|
+
#
|
244
|
+
# Return a copy of the string left filled with ASCII '0' digits to make a string
|
245
|
+
# of length width.
|
246
|
+
#
|
247
|
+
# A leading sign prefix ('+'/'-') is handled by inserting the padding after the
|
248
|
+
# sign character rather than before. The original string is returned if width is
|
249
|
+
# less than or equal to `s.length`.
|
250
|
+
#
|
251
|
+
# @param alignment [Integer]
|
252
|
+
# Fill the value up to this length
|
253
|
+
#
|
254
|
+
# @return [Expr]
|
255
|
+
#
|
256
|
+
# @example
|
257
|
+
# df = Polars::DataFrame.new(
|
258
|
+
# {
|
259
|
+
# "num" => [-10, -1, 0, 1, 10, 100, 1000, 10000, 100000, 1000000, nil]
|
260
|
+
# }
|
261
|
+
# )
|
262
|
+
# df.with_column(Polars.col("num").cast(String).str.zfill(5))
|
263
|
+
# # =>
|
264
|
+
# # shape: (11, 1)
|
265
|
+
# # ┌─────────┐
|
266
|
+
# # │ num │
|
267
|
+
# # │ --- │
|
268
|
+
# # │ str │
|
269
|
+
# # ╞═════════╡
|
270
|
+
# # │ -0010 │
|
271
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
272
|
+
# # │ -0001 │
|
273
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
274
|
+
# # │ 00000 │
|
275
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
276
|
+
# # │ 00001 │
|
277
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
278
|
+
# # │ ... │
|
279
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
280
|
+
# # │ 10000 │
|
281
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
282
|
+
# # │ 100000 │
|
283
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
284
|
+
# # │ 1000000 │
|
285
|
+
# # ├╌╌╌╌╌╌╌╌╌┤
|
286
|
+
# # │ null │
|
287
|
+
# # └─────────┘
|
53
288
|
def zfill(alignment)
  # alignment is forwarded unchanged to str_zfill; the result is wrapped.
  zero_filled = _rbexpr.str_zfill(alignment)
  Utils.wrap_expr(zero_filled)
end
|
56
291
|
|
292
|
+
# Return the string left justified in a string of length `width`.
|
293
|
+
#
|
294
|
+
# Padding is done using the specified `fillchar`.
|
295
|
+
# The original string is returned if `width` is less than or equal to
|
296
|
+
# `s.length`.
|
297
|
+
#
|
298
|
+
# @param width [Integer]
|
299
|
+
# Justify left to this length.
|
300
|
+
# @param fillchar [String]
|
301
|
+
# Fill with this ASCII character.
|
302
|
+
#
|
303
|
+
# @return [Expr]
|
304
|
+
#
|
305
|
+
# @example
|
306
|
+
# df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
|
307
|
+
# df.select(Polars.col("a").str.ljust(8, "*"))
|
308
|
+
# # =>
|
309
|
+
# # shape: (4, 1)
|
310
|
+
# # ┌──────────────┐
|
311
|
+
# # │ a │
|
312
|
+
# # │ --- │
|
313
|
+
# # │ str │
|
314
|
+
# # ╞══════════════╡
|
315
|
+
# # │ cow***** │
|
316
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
317
|
+
# # │ monkey** │
|
318
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
319
|
+
# # │ null │
|
320
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
321
|
+
# # │ hippopotamus │
|
322
|
+
# # └──────────────┘
|
57
323
|
def ljust(width, fillchar = " ")
  # width and fillchar are forwarded unchanged to str_ljust; the result is wrapped.
  justified = _rbexpr.str_ljust(width, fillchar)
  Utils.wrap_expr(justified)
end
|
60
326
|
|
327
|
+
# Return the string right justified in a string of length `width`.
|
328
|
+
#
|
329
|
+
# Padding is done using the specified `fillchar`.
|
330
|
+
# The original string is returned if `width` is less than or equal to
|
331
|
+
# `s.length`.
|
332
|
+
#
|
333
|
+
# @param width [Integer]
|
334
|
+
# Justify right to this length.
|
335
|
+
# @param fillchar [String]
|
336
|
+
# Fill with this ASCII character.
|
337
|
+
#
|
338
|
+
# @return [Expr]
|
339
|
+
#
|
340
|
+
# @example
|
341
|
+
# df = Polars::DataFrame.new({"a" => ["cow", "monkey", nil, "hippopotamus"]})
|
342
|
+
# df.select(Polars.col("a").str.rjust(8, "*"))
|
343
|
+
# # =>
|
344
|
+
# # shape: (4, 1)
|
345
|
+
# # ┌──────────────┐
|
346
|
+
# # │ a │
|
347
|
+
# # │ --- │
|
348
|
+
# # │ str │
|
349
|
+
# # ╞══════════════╡
|
350
|
+
# # │ *****cow │
|
351
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
352
|
+
# # │ **monkey │
|
353
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
354
|
+
# # │ null │
|
355
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
356
|
+
# # │ hippopotamus │
|
357
|
+
# # └──────────────┘
|
61
358
|
def rjust(width, fillchar = " ")
  # width and fillchar are forwarded unchanged to str_rjust; the result is wrapped.
  justified = _rbexpr.str_rjust(width, fillchar)
  Utils.wrap_expr(justified)
end
|
64
361
|
|
362
|
+
# Check if string contains a substring that matches a regex.
|
363
|
+
#
|
364
|
+
# @param pattern [String]
|
365
|
+
# A valid regex pattern.
|
366
|
+
# @param literal [Boolean]
|
367
|
+
# Treat pattern as a literal string.
|
368
|
+
#
|
369
|
+
# @return [Expr]
|
370
|
+
#
|
371
|
+
# @example
|
372
|
+
# df = Polars::DataFrame.new({"a" => ["Crab", "cat and dog", "rab$bit", nil]})
|
373
|
+
# df.select(
|
374
|
+
# [
|
375
|
+
# Polars.col("a"),
|
376
|
+
# Polars.col("a").str.contains("cat|bit").alias("regex"),
|
377
|
+
# Polars.col("a").str.contains("rab$", literal: true).alias("literal")
|
378
|
+
# ]
|
379
|
+
# )
|
380
|
+
# # =>
|
381
|
+
# # shape: (4, 3)
|
382
|
+
# # ┌─────────────┬───────┬─────────┐
|
383
|
+
# # │ a ┆ regex ┆ literal │
|
384
|
+
# # │ --- ┆ --- ┆ --- │
|
385
|
+
# # │ str ┆ bool ┆ bool │
|
386
|
+
# # ╞═════════════╪═══════╪═════════╡
|
387
|
+
# # │ Crab ┆ false ┆ false │
|
388
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
389
|
+
# # │ cat and dog ┆ true ┆ false │
|
390
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
391
|
+
# # │ rab$bit ┆ true ┆ true │
|
392
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
393
|
+
# # │ null ┆ null ┆ null │
|
394
|
+
# # └─────────────┴───────┴─────────┘
|
65
395
|
def contains(pattern, literal: false)
  # The literal keyword is passed positionally to str_contains.
  matched = _rbexpr.str_contains(pattern, literal)
  Utils.wrap_expr(matched)
end
|
68
398
|
|
399
|
+
# Check if string values end with a substring.
|
400
|
+
#
|
401
|
+
# @param sub [String]
|
402
|
+
# Suffix substring.
|
403
|
+
#
|
404
|
+
# @return [Expr]
|
405
|
+
#
|
406
|
+
# @example
|
407
|
+
# df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
|
408
|
+
# df.with_column(
|
409
|
+
# Polars.col("fruits").str.ends_with("go").alias("has_suffix")
|
410
|
+
# )
|
411
|
+
# # =>
|
412
|
+
# # shape: (3, 2)
|
413
|
+
# # ┌────────┬────────────┐
|
414
|
+
# # │ fruits ┆ has_suffix │
|
415
|
+
# # │ --- ┆ --- │
|
416
|
+
# # │ str ┆ bool │
|
417
|
+
# # ╞════════╪════════════╡
|
418
|
+
# # │ apple ┆ false │
|
419
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
420
|
+
# # │ mango ┆ true │
|
421
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
422
|
+
# # │ null ┆ null │
|
423
|
+
# # └────────┴────────────┘
|
424
|
+
#
|
425
|
+
# @example Using `ends_with` as a filter condition:
|
426
|
+
# df.filter(Polars.col("fruits").str.ends_with("go"))
|
427
|
+
# # =>
|
428
|
+
# # shape: (1, 1)
|
429
|
+
# # ┌────────┐
|
430
|
+
# # │ fruits │
|
431
|
+
# # │ --- │
|
432
|
+
# # │ str │
|
433
|
+
# # ╞════════╡
|
434
|
+
# # │ mango │
|
435
|
+
# # └────────┘
|
69
436
|
def ends_with(sub)
  # sub is forwarded unchanged to str_ends_with; the result is wrapped.
  has_suffix = _rbexpr.str_ends_with(sub)
  Utils.wrap_expr(has_suffix)
end
|
72
439
|
|
440
|
+
# Check if string values start with a substring.
|
441
|
+
#
|
442
|
+
# @param sub [String]
|
443
|
+
# Prefix substring.
|
444
|
+
#
|
445
|
+
# @return [Expr]
|
446
|
+
#
|
447
|
+
# @example
|
448
|
+
# df = Polars::DataFrame.new({"fruits" => ["apple", "mango", nil]})
|
449
|
+
# df.with_column(
|
450
|
+
# Polars.col("fruits").str.starts_with("app").alias("has_prefix")
|
451
|
+
# )
|
452
|
+
# # =>
|
453
|
+
# # shape: (3, 2)
|
454
|
+
# # ┌────────┬────────────┐
|
455
|
+
# # │ fruits ┆ has_prefix │
|
456
|
+
# # │ --- ┆ --- │
|
457
|
+
# # │ str ┆ bool │
|
458
|
+
# # ╞════════╪════════════╡
|
459
|
+
# # │ apple ┆ true │
|
460
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
461
|
+
# # │ mango ┆ false │
|
462
|
+
# # ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
|
463
|
+
# # │ null ┆ null │
|
464
|
+
# # └────────┴────────────┘
|
465
|
+
#
|
466
|
+
# @example Using `starts_with` as a filter condition:
|
467
|
+
# df.filter(Polars.col("fruits").str.starts_with("app"))
|
468
|
+
# # =>
|
469
|
+
# # shape: (1, 1)
|
470
|
+
# # ┌────────┐
|
471
|
+
# # │ fruits │
|
472
|
+
# # │ --- │
|
473
|
+
# # │ str │
|
474
|
+
# # ╞════════╡
|
475
|
+
# # │ apple │
|
476
|
+
# # └────────┘
|
73
477
|
def starts_with(sub)
  # sub is forwarded unchanged to str_starts_with; the result is wrapped.
  has_prefix = _rbexpr.str_starts_with(sub)
  Utils.wrap_expr(has_prefix)
end
|
@@ -83,18 +487,125 @@ module Polars
|
|
83
487
|
# def encode
|
84
488
|
# end
|
85
489
|
|
490
|
+
# Extract the target capture group from provided patterns.
|
491
|
+
#
|
492
|
+
# @param pattern [String]
|
493
|
+
# A valid regex pattern
|
494
|
+
# @param group_index [Integer]
|
495
|
+
# Index of the targeted capture group.
|
496
|
+
# Group 0 means the whole pattern; the first capture group begins at index 1.
|
497
|
+
# Defaults to the first capture group.
|
498
|
+
#
|
499
|
+
# @return [Expr]
|
500
|
+
#
|
501
|
+
# @example
|
502
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
503
|
+
# df.select(
|
504
|
+
# [
|
505
|
+
# Polars.col("foo").str.extract('(\d+)')
|
506
|
+
# ]
|
507
|
+
# )
|
508
|
+
# # =>
|
509
|
+
# # shape: (2, 1)
|
510
|
+
# # ┌─────┐
|
511
|
+
# # │ foo │
|
512
|
+
# # │ --- │
|
513
|
+
# # │ str │
|
514
|
+
# # ╞═════╡
|
515
|
+
# # │ 123 │
|
516
|
+
# # ├╌╌╌╌╌┤
|
517
|
+
# # │ 678 │
|
518
|
+
# # └─────┘
|
86
519
|
def extract(pattern, group_index: 1)
  # The group_index keyword is passed positionally to str_extract.
  captured = _rbexpr.str_extract(pattern, group_index)
  Utils.wrap_expr(captured)
end
|
89
522
|
|
523
|
+
# Extracts all matches for the given regex pattern.
|
524
|
+
#
|
525
|
+
# Extracts each successive non-overlapping regex match in an individual string as
|
526
|
+
# an array.
|
527
|
+
#
|
528
|
+
# @param pattern [String]
|
529
|
+
# A valid regex pattern
|
530
|
+
#
|
531
|
+
# @return [Expr]
|
532
|
+
#
|
533
|
+
# @example
|
534
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
535
|
+
# df.select(
|
536
|
+
# [
|
537
|
+
# Polars.col("foo").str.extract_all('(\d+)').alias("extracted_nrs")
|
538
|
+
# ]
|
539
|
+
# )
|
540
|
+
# # =>
|
541
|
+
# # shape: (2, 1)
|
542
|
+
# # ┌────────────────┐
|
543
|
+
# # │ extracted_nrs │
|
544
|
+
# # │ --- │
|
545
|
+
# # │ list[str] │
|
546
|
+
# # ╞════════════════╡
|
547
|
+
# # │ ["123", "45"] │
|
548
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
549
|
+
# # │ ["678", "910"] │
|
550
|
+
# # └────────────────┘
|
90
551
|
def extract_all(pattern)
  # pattern is forwarded unchanged to str_extract_all; the result is wrapped.
  all_matches = _rbexpr.str_extract_all(pattern)
  Utils.wrap_expr(all_matches)
end
|
93
554
|
|
555
|
+
# Count all successive non-overlapping regex matches.
|
556
|
+
#
|
557
|
+
# @param pattern [String]
|
558
|
+
# A valid regex pattern
|
559
|
+
#
|
560
|
+
# @return [Expr]
|
561
|
+
#
|
562
|
+
# @example
|
563
|
+
# df = Polars::DataFrame.new({"foo" => ["123 bla 45 asd", "xyz 678 910t"]})
|
564
|
+
# df.select(
|
565
|
+
# [
|
566
|
+
# Polars.col("foo").str.count_match('\d').alias("count_digits")
|
567
|
+
# ]
|
568
|
+
# )
|
569
|
+
# # =>
|
570
|
+
# # shape: (2, 1)
|
571
|
+
# # ┌──────────────┐
|
572
|
+
# # │ count_digits │
|
573
|
+
# # │ --- │
|
574
|
+
# # │ u32 │
|
575
|
+
# # ╞══════════════╡
|
576
|
+
# # │ 5 │
|
577
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
578
|
+
# # │ 6 │
|
579
|
+
# # └──────────────┘
|
94
580
|
def count_match(pattern)
  # NOTE(review): unlike the sibling methods, the call here is `count_match`
  # with no `str_` prefix — kept exactly as in the original.
  match_count = _rbexpr.count_match(pattern)
  Utils.wrap_expr(match_count)
end
|
97
583
|
|
584
|
+
# Split the string by a substring.
|
585
|
+
#
|
586
|
+
# @param by [String]
|
587
|
+
# Substring to split by.
|
588
|
+
# @param inclusive [Boolean]
|
589
|
+
# If true, include the split character/string in the results.
|
590
|
+
#
|
591
|
+
# @return [Expr]
|
592
|
+
#
|
593
|
+
# @example
|
594
|
+
# df = Polars::DataFrame.new({"s" => ["foo bar", "foo-bar", "foo bar baz"]})
|
595
|
+
# df.select(Polars.col("s").str.split(" "))
|
596
|
+
# # =>
|
597
|
+
# # shape: (3, 1)
|
598
|
+
# # ┌───────────────────────┐
|
599
|
+
# # │ s │
|
600
|
+
# # │ --- │
|
601
|
+
# # │ list[str] │
|
602
|
+
# # ╞═══════════════════════╡
|
603
|
+
# # │ ["foo", "bar"] │
|
604
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
605
|
+
# # │ ["foo-bar"] │
|
606
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
607
|
+
# # │ ["foo", "bar", "baz"] │
|
608
|
+
# # └───────────────────────┘
|
98
609
|
def split(by, inclusive: false)
|
99
610
|
if inclusive
|
100
611
|
Utils.wrap_expr(_rbexpr.str_split_inclusive(by))
|
@@ -103,6 +614,43 @@ module Polars
|
|
103
614
|
end
|
104
615
|
end
|
105
616
|
|
617
|
+
# Split the string by a substring using `n` splits.
|
618
|
+
#
|
619
|
+
# Results in a struct of `n+1` fields.
|
620
|
+
#
|
621
|
+
# If it cannot make `n` splits, the remaining field elements will be null.
|
622
|
+
#
|
623
|
+
# @param by [String]
|
624
|
+
# Substring to split by.
|
625
|
+
# @param n [Integer]
|
626
|
+
# Number of splits to make.
|
627
|
+
# @param inclusive [Boolean]
|
628
|
+
# If true, include the split character/string in the results.
|
629
|
+
#
|
630
|
+
# @return [Expr]
|
631
|
+
#
|
632
|
+
# @example
|
633
|
+
# df = Polars::DataFrame.new({"x" => ["a_1", nil, "c", "d_4"]})
|
634
|
+
# df.select(
|
635
|
+
# [
|
636
|
+
# Polars.col("x").str.split_exact("_", 1).alias("fields")
|
637
|
+
# ]
|
638
|
+
# )
|
639
|
+
# # =>
|
640
|
+
# # shape: (4, 1)
|
641
|
+
# # ┌─────────────┐
|
642
|
+
# # │ fields │
|
643
|
+
# # │ --- │
|
644
|
+
# # │ struct[2] │
|
645
|
+
# # ╞═════════════╡
|
646
|
+
# # │ {"a","1"} │
|
647
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
648
|
+
# # │ {null,null} │
|
649
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
650
|
+
# # │ {"c",null} │
|
651
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
652
|
+
# # │ {"d","4"} │
|
653
|
+
# # └─────────────┘
|
106
654
|
def split_exact(by, n, inclusive: false)
|
107
655
|
if inclusive
|
108
656
|
Utils.wrap_expr(_rbexpr.str_split_exact_inclusive(by, n))
|
@@ -111,22 +659,135 @@ module Polars
|
|
111
659
|
end
|
112
660
|
end
|
113
661
|
|
662
|
+
# Split the string by a substring, restricted to returning at most `n` items.
|
663
|
+
#
|
664
|
+
# If the number of possible splits is less than `n-1`, the remaining field
|
665
|
+
# elements will be null. If the number of possible splits is `n-1` or greater,
|
666
|
+
# the last (nth) substring will contain the remainder of the string.
|
667
|
+
#
|
668
|
+
# @param by [String]
|
669
|
+
# Substring to split by.
|
670
|
+
# @param n [Integer]
|
671
|
+
# Max number of items to return.
|
672
|
+
#
|
673
|
+
# @return [Expr]
|
674
|
+
#
|
675
|
+
# @example
|
676
|
+
# df = Polars::DataFrame.new({"s" => ["foo bar", nil, "foo-bar", "foo bar baz"]})
|
677
|
+
# df.select(Polars.col("s").str.splitn(" ", 2).alias("fields"))
|
678
|
+
# # =>
|
679
|
+
# # shape: (4, 1)
|
680
|
+
# # ┌───────────────────┐
|
681
|
+
# # │ fields │
|
682
|
+
# # │ --- │
|
683
|
+
# # │ struct[2] │
|
684
|
+
# # ╞═══════════════════╡
|
685
|
+
# # │ {"foo","bar"} │
|
686
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
687
|
+
# # │ {null,null} │
|
688
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
689
|
+
# # │ {"foo-bar",null} │
|
690
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
|
691
|
+
# # │ {"foo","bar baz"} │
|
692
|
+
# # └───────────────────┘
|
114
693
|
def splitn(by, n)
  # by and n are forwarded unchanged to str_splitn; the result is wrapped.
  pieces = _rbexpr.str_splitn(by, n)
  Utils.wrap_expr(pieces)
end
|
117
696
|
|
118
|
-
|
697
|
+
# Replace first matching regex/literal substring with a new string value.
|
698
|
+
#
|
699
|
+
# @param pattern [String]
|
700
|
+
# Regex pattern.
|
701
|
+
# @param value [String]
|
702
|
+
# Replacement string.
|
703
|
+
# @param literal [Boolean]
|
704
|
+
# Treat pattern as a literal string.
|
705
|
+
#
|
706
|
+
# @return [Expr]
|
707
|
+
#
|
708
|
+
# @example
|
709
|
+
# df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["123abc", "abc456"]})
|
710
|
+
# df.with_column(
|
711
|
+
# Polars.col("text").str.replace('abc\b', "ABC")
|
712
|
+
# )
|
713
|
+
# # =>
|
714
|
+
# # shape: (2, 2)
|
715
|
+
# # ┌─────┬────────┐
|
716
|
+
# # │ id ┆ text │
|
717
|
+
# # │ --- ┆ --- │
|
718
|
+
# # │ i64 ┆ str │
|
719
|
+
# # ╞═════╪════════╡
|
720
|
+
# # │ 1 ┆ 123ABC │
|
721
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
|
722
|
+
# # │ 2 ┆ abc456 │
|
723
|
+
# # └─────┴────────┘
|
724
|
+
def replace(pattern, value, literal: false)
  # Both arguments go through Utils.expr_to_lit_or_expr (str_to_lit: true);
  # their _rbexpr values are what get passed to str_replace.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
|
123
729
|
|
124
|
-
|
730
|
+
# Replace all matching regex/literal substrings with a new string value.
|
731
|
+
#
|
732
|
+
# @param pattern [String]
|
733
|
+
# Regex pattern.
|
734
|
+
# @param value [String]
|
735
|
+
# Replacement string.
|
736
|
+
# @param literal [Boolean]
|
737
|
+
# Treat pattern as a literal string.
|
738
|
+
#
|
739
|
+
# @return [Expr]
|
740
|
+
#
|
741
|
+
# @example
|
742
|
+
# df = Polars::DataFrame.new({"id" => [1, 2], "text" => ["abcabc", "123a123"]})
|
743
|
+
# df.with_column(Polars.col("text").str.replace_all("a", "-"))
|
744
|
+
# # =>
|
745
|
+
# # shape: (2, 2)
|
746
|
+
# # ┌─────┬─────────┐
|
747
|
+
# # │ id ┆ text │
|
748
|
+
# # │ --- ┆ --- │
|
749
|
+
# # │ i64 ┆ str │
|
750
|
+
# # ╞═════╪═════════╡
|
751
|
+
# # │ 1 ┆ -bc-bc │
|
752
|
+
# # ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
|
753
|
+
# # │ 2 ┆ 123-123 │
|
754
|
+
# # └─────┴─────────┘
|
755
|
+
def replace_all(pattern, value, literal: false)
  # Both arguments go through Utils.expr_to_lit_or_expr (str_to_lit: true);
  # their _rbexpr values are what get passed to str_replace_all.
  pattern_expr = Utils.expr_to_lit_or_expr(pattern, str_to_lit: true)
  value_expr = Utils.expr_to_lit_or_expr(value, str_to_lit: true)
  replaced = _rbexpr.str_replace_all(pattern_expr._rbexpr, value_expr._rbexpr, literal)
  Utils.wrap_expr(replaced)
end
|
129
760
|
|
761
|
+
# Create subslices of the string values of a Utf8 Series.
|
762
|
+
#
|
763
|
+
# @param offset [Integer]
|
764
|
+
# Start index. Negative indexing is supported.
|
765
|
+
# @param length [Integer, nil]
|
766
|
+
# Length of the slice. If set to `nil` (default), the slice is taken to the
|
767
|
+
# end of the string.
|
768
|
+
#
|
769
|
+
# @return [Expr]
|
770
|
+
#
|
771
|
+
# @example
|
772
|
+
# df = Polars::DataFrame.new({"s" => ["pear", nil, "papaya", "dragonfruit"]})
|
773
|
+
# df.with_column(
|
774
|
+
# Polars.col("s").str.slice(-3).alias("s_sliced")
|
775
|
+
# )
|
776
|
+
# # =>
|
777
|
+
# # shape: (4, 2)
|
778
|
+
# # ┌─────────────┬──────────┐
|
779
|
+
# # │ s ┆ s_sliced │
|
780
|
+
# # │ --- ┆ --- │
|
781
|
+
# # │ str ┆ str │
|
782
|
+
# # ╞═════════════╪══════════╡
|
783
|
+
# # │ pear ┆ ear │
|
784
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
785
|
+
# # │ null ┆ null │
|
786
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
787
|
+
# # │ papaya ┆ aya │
|
788
|
+
# # ├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
|
789
|
+
# # │ dragonfruit ┆ uit │
|
790
|
+
# # └─────────────┴──────────┘
|
130
791
|
def slice(offset, length = nil)
  # offset and length (nil by default) are forwarded unchanged to str_slice.
  sliced = _rbexpr.str_slice(offset, length)
  Utils.wrap_expr(sliced)
end
|