polars-df 0.21.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/Cargo.lock +1 -1
- data/ext/polars/Cargo.toml +7 -1
- data/ext/polars/src/conversion/mod.rs +92 -4
- data/ext/polars/src/exceptions.rs +1 -0
- data/ext/polars/src/expr/array.rs +73 -4
- data/ext/polars/src/expr/binary.rs +26 -1
- data/ext/polars/src/expr/bitwise.rs +39 -0
- data/ext/polars/src/expr/categorical.rs +20 -0
- data/ext/polars/src/expr/datatype.rs +24 -1
- data/ext/polars/src/expr/datetime.rs +58 -0
- data/ext/polars/src/expr/general.rs +84 -5
- data/ext/polars/src/expr/list.rs +24 -0
- data/ext/polars/src/expr/meta.rs +11 -0
- data/ext/polars/src/expr/mod.rs +1 -0
- data/ext/polars/src/expr/name.rs +8 -0
- data/ext/polars/src/expr/rolling.rs +20 -0
- data/ext/polars/src/expr/string.rs +59 -0
- data/ext/polars/src/expr/struct.rs +9 -1
- data/ext/polars/src/functions/io.rs +19 -0
- data/ext/polars/src/functions/lazy.rs +4 -0
- data/ext/polars/src/lazyframe/general.rs +51 -0
- data/ext/polars/src/lib.rs +119 -10
- data/ext/polars/src/map/dataframe.rs +2 -2
- data/ext/polars/src/map/series.rs +1 -1
- data/ext/polars/src/series/aggregation.rs +44 -0
- data/ext/polars/src/series/general.rs +64 -4
- data/lib/polars/array_expr.rb +382 -3
- data/lib/polars/array_name_space.rb +281 -0
- data/lib/polars/binary_expr.rb +67 -0
- data/lib/polars/binary_name_space.rb +43 -0
- data/lib/polars/cat_expr.rb +224 -0
- data/lib/polars/cat_name_space.rb +138 -0
- data/lib/polars/config.rb +2 -2
- data/lib/polars/convert.rb +6 -6
- data/lib/polars/data_frame.rb +684 -19
- data/lib/polars/data_type_expr.rb +52 -0
- data/lib/polars/data_types.rb +14 -2
- data/lib/polars/date_time_expr.rb +251 -0
- data/lib/polars/date_time_name_space.rb +299 -0
- data/lib/polars/expr.rb +1213 -180
- data/lib/polars/functions/datatype.rb +21 -0
- data/lib/polars/functions/lazy.rb +13 -0
- data/lib/polars/io/csv.rb +1 -1
- data/lib/polars/io/json.rb +4 -4
- data/lib/polars/io/ndjson.rb +4 -4
- data/lib/polars/io/parquet.rb +27 -5
- data/lib/polars/lazy_frame.rb +936 -20
- data/lib/polars/list_expr.rb +196 -4
- data/lib/polars/list_name_space.rb +201 -4
- data/lib/polars/meta_expr.rb +64 -0
- data/lib/polars/name_expr.rb +36 -0
- data/lib/polars/schema.rb +79 -3
- data/lib/polars/selector.rb +72 -0
- data/lib/polars/selectors.rb +3 -3
- data/lib/polars/series.rb +1051 -54
- data/lib/polars/string_expr.rb +411 -6
- data/lib/polars/string_name_space.rb +722 -49
- data/lib/polars/struct_expr.rb +103 -0
- data/lib/polars/struct_name_space.rb +19 -1
- data/lib/polars/utils/various.rb +18 -1
- data/lib/polars/utils.rb +5 -1
- data/lib/polars/version.rb +1 -1
- data/lib/polars.rb +2 -0
- metadata +4 -1
@@ -74,6 +74,60 @@ module Polars
|
|
74
74
|
super
|
75
75
|
end
|
76
76
|
|
77
|
+
# Compute the std of the values of the sub-arrays.
|
78
|
+
#
|
79
|
+
# @return [Series]
|
80
|
+
#
|
81
|
+
# @example
|
82
|
+
# s = Polars::Series.new("a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2))
|
83
|
+
# s.arr.std
|
84
|
+
# # =>
|
85
|
+
# # shape: (2,)
|
86
|
+
# # Series: 'a' [f64]
|
87
|
+
# # [
|
88
|
+
# # 0.707107
|
89
|
+
# # 0.707107
|
90
|
+
# # ]
|
91
|
+
def std(ddof: 1)
|
92
|
+
super
|
93
|
+
end
|
94
|
+
|
95
|
+
# Compute the var of the values of the sub-arrays.
|
96
|
+
#
|
97
|
+
# @return [Series]
|
98
|
+
#
|
99
|
+
# @example
|
100
|
+
# s = Polars::Series.new("a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2))
|
101
|
+
# s.arr.var
|
102
|
+
# # =>
|
103
|
+
# # shape: (2,)
|
104
|
+
# # Series: 'a' [f64]
|
105
|
+
# # [
|
106
|
+
# # 0.5
|
107
|
+
# # 0.5
|
108
|
+
# # ]
|
109
|
+
def var(ddof: 1)
|
110
|
+
super
|
111
|
+
end
|
112
|
+
|
113
|
+
# Compute the median of the values of the sub-arrays.
|
114
|
+
#
|
115
|
+
# @return [Series]
|
116
|
+
#
|
117
|
+
# @example
|
118
|
+
# s = Polars::Series.new("a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int64, 2))
|
119
|
+
# s.arr.median
|
120
|
+
# # =>
|
121
|
+
# # shape: (2,)
|
122
|
+
# # Series: 'a' [f64]
|
123
|
+
# # [
|
124
|
+
# # 1.5
|
125
|
+
# # 3.5
|
126
|
+
# # ]
|
127
|
+
def median
|
128
|
+
super
|
129
|
+
end
|
130
|
+
|
77
131
|
# Get the unique/distinct values in the array.
|
78
132
|
#
|
79
133
|
# @param maintain_order [Boolean]
|
@@ -102,6 +156,24 @@ module Polars
|
|
102
156
|
super
|
103
157
|
end
|
104
158
|
|
159
|
+
# Count the number of unique values in every sub-array.
|
160
|
+
#
|
161
|
+
# @return [Series]
|
162
|
+
#
|
163
|
+
# @example
|
164
|
+
# s = Polars::Series.new("a", [[1, 2], [4, 4]], dtype: Polars::Array.new(Polars::Int64, 2))
|
165
|
+
# s.arr.n_unique
|
166
|
+
# # =>
|
167
|
+
# # shape: (2,)
|
168
|
+
# # Series: 'a' [u32]
|
169
|
+
# # [
|
170
|
+
# # 2
|
171
|
+
# # 1
|
172
|
+
# # ]
|
173
|
+
def n_unique
|
174
|
+
super
|
175
|
+
end
|
176
|
+
|
105
177
|
# Convert an Array column into a List column with the same inner data type.
|
106
178
|
#
|
107
179
|
# @return [Series]
|
@@ -144,6 +216,148 @@ module Polars
|
|
144
216
|
super
|
145
217
|
end
|
146
218
|
|
219
|
+
# Return the number of elements in each array.
|
220
|
+
#
|
221
|
+
# @return [Series]
|
222
|
+
#
|
223
|
+
# @example
|
224
|
+
# s = Polars::Series.new("a", [[1, 2], [4, 3]], dtype: Polars::Array.new(Polars::Int8, 2))
|
225
|
+
# s.arr.len
|
226
|
+
# # =>
|
227
|
+
# # shape: (2,)
|
228
|
+
# # Series: 'a' [u32]
|
229
|
+
# # [
|
230
|
+
# # 2
|
231
|
+
# # 2
|
232
|
+
# # ]
|
233
|
+
def len
|
234
|
+
super
|
235
|
+
end
|
236
|
+
|
237
|
+
# Slice the sub-arrays.
|
238
|
+
#
|
239
|
+
# @param offset [Integer]
|
240
|
+
# The starting index of the slice.
|
241
|
+
# @param length [Integer]
|
242
|
+
# The length of the slice.
|
243
|
+
#
|
244
|
+
# @return [Series]
|
245
|
+
#
|
246
|
+
# @example
|
247
|
+
# s = Polars::Series.new(
|
248
|
+
# [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
|
249
|
+
# dtype: Polars::Array.new(Polars::Int64, 6)
|
250
|
+
# )
|
251
|
+
# s.arr.slice(1)
|
252
|
+
# # =>
|
253
|
+
# # shape: (2,)
|
254
|
+
# # Series: '' [list[i64]]
|
255
|
+
# # [
|
256
|
+
# # [2, 3, … 6]
|
257
|
+
# # [8, 9, … 12]
|
258
|
+
# # ]
|
259
|
+
#
|
260
|
+
# @example
|
261
|
+
# s.arr.slice(1, 3, as_array: true)
|
262
|
+
# # =>
|
263
|
+
# # shape: (2,)
|
264
|
+
# # Series: '' [array[i64, 3]]
|
265
|
+
# # [
|
266
|
+
# # [2, 3, 4]
|
267
|
+
# # [8, 9, 10]
|
268
|
+
# # ]
|
269
|
+
#
|
270
|
+
# @example
|
271
|
+
# s.arr.slice(-2)
|
272
|
+
# # =>
|
273
|
+
# # shape: (2,)
|
274
|
+
# # Series: '' [list[i64]]
|
275
|
+
# # [
|
276
|
+
# # [5, 6]
|
277
|
+
# # [11, 12]
|
278
|
+
# # ]
|
279
|
+
def slice(
|
280
|
+
offset,
|
281
|
+
length = nil,
|
282
|
+
as_array: false
|
283
|
+
)
|
284
|
+
super
|
285
|
+
end
|
286
|
+
|
287
|
+
# Get the first `n` elements of the sub-arrays.
|
288
|
+
#
|
289
|
+
# @param n [Integer]
|
290
|
+
# Number of values to return for each sub-array.
|
291
|
+
# @param as_array [Boolean]
|
292
|
+
# Return result as a fixed-length `Array`, otherwise as a `List`.
|
293
|
+
# If true `n` must be a constant value.
|
294
|
+
#
|
295
|
+
# @return [Series]
|
296
|
+
#
|
297
|
+
# @example
|
298
|
+
# s = Polars::Series.new(
|
299
|
+
# [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
|
300
|
+
# dtype: Polars::Array.new(Polars::Int64, 6)
|
301
|
+
# )
|
302
|
+
# s.arr.head
|
303
|
+
# # =>
|
304
|
+
# # shape: (2,)
|
305
|
+
# # Series: '' [list[i64]]
|
306
|
+
# # [
|
307
|
+
# # [1, 2, … 5]
|
308
|
+
# # [7, 8, … 11]
|
309
|
+
# # ]
|
310
|
+
#
|
311
|
+
# @example
|
312
|
+
# s.arr.head(3, as_array: true)
|
313
|
+
# # =>
|
314
|
+
# # shape: (2,)
|
315
|
+
# # Series: '' [array[i64, 3]]
|
316
|
+
# # [
|
317
|
+
# # [1, 2, 3]
|
318
|
+
# # [7, 8, 9]
|
319
|
+
# # ]
|
320
|
+
def head(n = 5, as_array: false)
|
321
|
+
super
|
322
|
+
end
|
323
|
+
|
324
|
+
# Slice the last `n` values of every sub-array.
|
325
|
+
#
|
326
|
+
# @param n [Integer]
|
327
|
+
# Number of values to return for each sub-array.
|
328
|
+
# @param as_array [Boolean]
|
329
|
+
# Return result as a fixed-length `Array`, otherwise as a `List`.
|
330
|
+
# If true `n` must be a constant value.
|
331
|
+
#
|
332
|
+
# @return [Series]
|
333
|
+
#
|
334
|
+
# @example
|
335
|
+
# s = Polars::Series.new(
|
336
|
+
# [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
|
337
|
+
# dtype: Polars::Array.new(Polars::Int64, 6)
|
338
|
+
# )
|
339
|
+
# s.arr.tail
|
340
|
+
# # =>
|
341
|
+
# # shape: (2,)
|
342
|
+
# # Series: '' [list[i64]]
|
343
|
+
# # [
|
344
|
+
# # [2, 3, … 6]
|
345
|
+
# # [8, 9, … 12]
|
346
|
+
# # ]
|
347
|
+
#
|
348
|
+
# @example
|
349
|
+
# s.arr.tail(3, as_array: true)
|
350
|
+
# # =>
|
351
|
+
# # shape: (2,)
|
352
|
+
# # Series: '' [array[i64, 3]]
|
353
|
+
# # [
|
354
|
+
# # [4, 5, 6]
|
355
|
+
# # [10, 11, 12]
|
356
|
+
# # ]
|
357
|
+
def tail(n = 5, as_array: false)
|
358
|
+
super
|
359
|
+
end
|
360
|
+
|
147
361
|
# Evaluate whether all boolean values are true for every subarray.
|
148
362
|
#
|
149
363
|
# @return [Series]
|
@@ -419,5 +633,72 @@ module Polars
|
|
419
633
|
def count_matches(element)
|
420
634
|
super
|
421
635
|
end
|
636
|
+
|
637
|
+
# Convert the series of type `Array` to a series of type `Struct`.
|
638
|
+
#
|
639
|
+
# @param fields [Object]
|
640
|
+
# If the names and number of the desired fields are known in advance
|
641
|
+
# a list of field names can be given, which will be assigned by index.
|
642
|
+
# Otherwise, to dynamically assign field names, a custom function can be
|
643
|
+
# used; if neither is set, fields will be `field_0, field_1 .. field_n`.
|
644
|
+
#
|
645
|
+
# @return [Series]
|
646
|
+
#
|
647
|
+
# @example Convert array to struct with default field name assignment:
|
648
|
+
# s1 = Polars::Series.new("n", [[0, 1, 2], [3, 4, 5]], dtype: Polars::Array.new(Polars::Int8, 3))
|
649
|
+
# s2 = s1.arr.to_struct
|
650
|
+
# # =>
|
651
|
+
# # shape: (2,)
|
652
|
+
# # Series: 'n' [struct[3]]
|
653
|
+
# # [
|
654
|
+
# # {0,1,2}
|
655
|
+
# # {3,4,5}
|
656
|
+
# # ]
|
657
|
+
#
|
658
|
+
# @example
|
659
|
+
# s2.struct.fields
|
660
|
+
# # => ["field_0", "field_1", "field_2"]
|
661
|
+
def to_struct(
|
662
|
+
fields: nil
|
663
|
+
)
|
664
|
+
s = Utils.wrap_s(_s)
|
665
|
+
s.to_frame.select(F.col(s.name).arr.to_struct(fields: fields)).to_series
|
666
|
+
end
|
667
|
+
|
668
|
+
# Shift array values by the given number of indices.
|
669
|
+
#
|
670
|
+
# @param n [Integer]
|
671
|
+
# Number of indices to shift forward. If a negative value is passed, values
|
672
|
+
# are shifted in the opposite direction instead.
|
673
|
+
#
|
674
|
+
# @return [Series]
|
675
|
+
#
|
676
|
+
# @note
|
677
|
+
# This method is similar to the `LAG` operation in SQL when the value for `n`
|
678
|
+
# is positive. With a negative value for `n`, it is similar to `LEAD`.
|
679
|
+
#
|
680
|
+
# @example By default, array values are shifted forward by one index.
|
681
|
+
# s = Polars::Series.new([[1, 2, 3], [4, 5, 6]], dtype: Polars::Array.new(Polars::Int64, 3))
|
682
|
+
# s.arr.shift
|
683
|
+
# # =>
|
684
|
+
# # shape: (2,)
|
685
|
+
# # Series: '' [array[i64, 3]]
|
686
|
+
# # [
|
687
|
+
# # [null, 1, 2]
|
688
|
+
# # [null, 4, 5]
|
689
|
+
# # ]
|
690
|
+
#
|
691
|
+
# @example Pass a negative value to shift in the opposite direction instead.
|
692
|
+
# s.arr.shift(-2)
|
693
|
+
# # =>
|
694
|
+
# # shape: (2,)
|
695
|
+
# # Series: '' [array[i64, 3]]
|
696
|
+
# # [
|
697
|
+
# # [3, null, null]
|
698
|
+
# # [6, null, null]
|
699
|
+
# # ]
|
700
|
+
def shift(n = 1)
|
701
|
+
super
|
702
|
+
end
|
422
703
|
end
|
423
704
|
end
|
data/lib/polars/binary_expr.rb
CHANGED
@@ -197,5 +197,72 @@ module Polars
|
|
197
197
|
raise ArgumentError, "encoding must be one of {{'hex', 'base64'}}, got #{encoding}"
|
198
198
|
end
|
199
199
|
end
|
200
|
+
|
201
|
+
# Get the size of binary values in the given unit.
|
202
|
+
#
|
203
|
+
# @param unit ['b', 'kb', 'mb', 'gb', 'tb']
|
204
|
+
# Scale the returned size to the given unit.
|
205
|
+
#
|
206
|
+
# @return [Expr]
|
207
|
+
#
|
208
|
+
# @example
|
209
|
+
# df = Polars::DataFrame.new({"data" => [512, 256, 1024].map { |n| "\x00".b * n }})
|
210
|
+
# df.with_columns(
|
211
|
+
# n_bytes: Polars.col("data").bin.size,
|
212
|
+
# n_kilobytes: Polars.col("data").bin.size("kb")
|
213
|
+
# )
|
214
|
+
# # =>
|
215
|
+
# # shape: (3, 3)
|
216
|
+
# # ┌─────────────────────────────────┬─────────┬─────────────┐
|
217
|
+
# # │ data ┆ n_bytes ┆ n_kilobytes │
|
218
|
+
# # │ --- ┆ --- ┆ --- │
|
219
|
+
# # │ binary ┆ u32 ┆ f64 │
|
220
|
+
# # ╞═════════════════════════════════╪═════════╪═════════════╡
|
221
|
+
# # │ b"\x00\x00\x00\x00\x00\x00\x00… ┆ 512 ┆ 0.5 │
|
222
|
+
# # │ b"\x00\x00\x00\x00\x00\x00\x00… ┆ 256 ┆ 0.25 │
|
223
|
+
# # │ b"\x00\x00\x00\x00\x00\x00\x00… ┆ 1024 ┆ 1.0 │
|
224
|
+
# # └─────────────────────────────────┴─────────┴─────────────┘
|
225
|
+
def size(unit = "b")
|
226
|
+
sz = Utils.wrap_expr(_rbexpr.bin_size_bytes)
|
227
|
+
sz = Utils.scale_bytes(sz, to: unit)
|
228
|
+
sz
|
229
|
+
end
|
230
|
+
|
231
|
+
# Interpret a buffer as a numerical Polars type.
|
232
|
+
#
|
233
|
+
# @param dtype [Object]
|
234
|
+
# Which type to interpret binary column into.
|
235
|
+
# @param endianness ["big", "little"]
|
236
|
+
# Which endianness to use when interpreting bytes, by default "little".
|
237
|
+
#
|
238
|
+
# @return [Expr]
|
239
|
+
#
|
240
|
+
# @example
|
241
|
+
# df = Polars::DataFrame.new({"data" => ["\x05\x00\x00\x00".b, "\x10\x00\x01\x00".b]})
|
242
|
+
# df.with_columns(
|
243
|
+
# bin2int: Polars.col("data").bin.reinterpret(
|
244
|
+
# dtype: Polars::Int32, endianness: "little"
|
245
|
+
# )
|
246
|
+
# )
|
247
|
+
# # =>
|
248
|
+
# # shape: (2, 2)
|
249
|
+
# # ┌─────────────────────┬─────────┐
|
250
|
+
# # │ data ┆ bin2int │
|
251
|
+
# # │ --- ┆ --- │
|
252
|
+
# # │ binary ┆ i32 │
|
253
|
+
# # ╞═════════════════════╪═════════╡
|
254
|
+
# # │ b"\x05\x00\x00\x00" ┆ 5 │
|
255
|
+
# # │ b"\x10\x00\x01\x00" ┆ 65552 │
|
256
|
+
# # └─────────────────────┴─────────┘
|
257
|
+
def reinterpret(
|
258
|
+
dtype:,
|
259
|
+
endianness: "little"
|
260
|
+
)
|
261
|
+
dtype = Utils.parse_into_datatype_expr(dtype)
|
262
|
+
|
263
|
+
Utils.wrap_expr(
|
264
|
+
_rbexpr.bin_reinterpret(dtype._rbdatatype_expr, endianness)
|
265
|
+
)
|
266
|
+
end
|
200
267
|
end
|
201
268
|
end
|
@@ -157,5 +157,48 @@ module Polars
|
|
157
157
|
def encode(encoding)
|
158
158
|
super
|
159
159
|
end
|
160
|
+
|
161
|
+
# Get the size of the binary values in a Series in the given unit.
|
162
|
+
#
|
163
|
+
# @return [Series]
|
164
|
+
#
|
165
|
+
# @example
|
166
|
+
# s = Polars::Series.new("data", [512, 256, 2560, 1024].map { |n| "\x00".b * n })
|
167
|
+
# s.bin.size("kb")
|
168
|
+
# # =>
|
169
|
+
# # shape: (4,)
|
170
|
+
# # Series: 'data' [f64]
|
171
|
+
# # [
|
172
|
+
# # 0.5
|
173
|
+
# # 0.25
|
174
|
+
# # 2.5
|
175
|
+
# # 1.0
|
176
|
+
# # ]
|
177
|
+
def size(unit = "b")
|
178
|
+
super
|
179
|
+
end
|
180
|
+
|
181
|
+
# Interpret a buffer as a numerical Polars type.
|
182
|
+
#
|
183
|
+
# @param dtype [Object]
|
184
|
+
# Which type to interpret binary column into.
|
185
|
+
# @param endianness ["big", "little"]
|
186
|
+
# Which endianness to use when interpreting bytes, by default "little".
|
187
|
+
#
|
188
|
+
# @return [Series]
|
189
|
+
#
|
190
|
+
# @example
|
191
|
+
# s = Polars::Series.new("data", ["\x05\x00\x00\x00".b, "\x10\x00\x01\x00".b])
|
192
|
+
# s.bin.reinterpret(dtype: Polars::Int32, endianness: "little")
|
193
|
+
# # =>
|
194
|
+
# # shape: (2,)
|
195
|
+
# # Series: 'data' [i32]
|
196
|
+
# # [
|
197
|
+
# # 5
|
198
|
+
# # 65552
|
199
|
+
# # ]
|
200
|
+
def reinterpret(dtype:, endianness: "little")
|
201
|
+
super
|
202
|
+
end
|
160
203
|
end
|
161
204
|
end
|
data/lib/polars/cat_expr.rb
CHANGED
@@ -32,5 +32,229 @@ module Polars
|
|
32
32
|
def get_categories
|
33
33
|
Utils.wrap_expr(_rbexpr.cat_get_categories)
|
34
34
|
end
|
35
|
+
|
36
|
+
# Return the byte-length of the string representation of each value.
|
37
|
+
#
|
38
|
+
# @return [Expr]
|
39
|
+
#
|
40
|
+
# @note
|
41
|
+
# When working with non-ASCII text, the length in bytes is not the same as the
|
42
|
+
# length in characters. You may want to use `len_chars` instead.
|
43
|
+
# Note that `len_bytes` is much more performant (_O(1)_) than
|
44
|
+
# `len_chars` (_O(n)_).
|
45
|
+
#
|
46
|
+
# @example
|
47
|
+
# df = Polars::DataFrame.new(
|
48
|
+
# {"a" => Polars::Series.new(["Café", "345", "東京", nil], dtype: Polars::Categorical)}
|
49
|
+
# )
|
50
|
+
# df.with_columns(
|
51
|
+
# Polars.col("a").cat.len_bytes.alias("n_bytes"),
|
52
|
+
# Polars.col("a").cat.len_chars.alias("n_chars")
|
53
|
+
# )
|
54
|
+
# # =>
|
55
|
+
# # shape: (4, 3)
|
56
|
+
# # ┌──────┬─────────┬─────────┐
|
57
|
+
# # │ a ┆ n_bytes ┆ n_chars │
|
58
|
+
# # │ --- ┆ --- ┆ --- │
|
59
|
+
# # │ cat ┆ u32 ┆ u32 │
|
60
|
+
# # ╞══════╪═════════╪═════════╡
|
61
|
+
# # │ Café ┆ 5 ┆ 4 │
|
62
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
63
|
+
# # │ 東京 ┆ 6 ┆ 2 │
|
64
|
+
# # │ null ┆ null ┆ null │
|
65
|
+
# # └──────┴─────────┴─────────┘
|
66
|
+
def len_bytes
|
67
|
+
Utils.wrap_expr(_rbexpr.cat_len_bytes)
|
68
|
+
end
|
69
|
+
|
70
|
+
# Return the number of characters of the string representation of each value.
|
71
|
+
#
|
72
|
+
# @return [Expr]
|
73
|
+
#
|
74
|
+
# @note
|
75
|
+
# When working with ASCII text, use `len_bytes` instead to achieve
|
76
|
+
# equivalent output with much better performance:
|
77
|
+
# `len_bytes` runs in _O(1)_, while `len_chars` runs in _O(n)_.
|
78
|
+
#
|
79
|
+
# A character is defined as a [Unicode scalar value](https://www.unicode.org/glossary/#unicode_scalar_value). A single character is
|
80
|
+
# represented by a single byte when working with ASCII text, and a maximum of
|
81
|
+
# 4 bytes otherwise.
|
82
|
+
#
|
83
|
+
# @example
|
84
|
+
# df = Polars::DataFrame.new(
|
85
|
+
# {"a" => Polars::Series.new(["Café", "345", "東京", nil], dtype: Polars::Categorical)}
|
86
|
+
# )
|
87
|
+
# df.with_columns(
|
88
|
+
# Polars.col("a").cat.len_chars.alias("n_chars"),
|
89
|
+
# Polars.col("a").cat.len_bytes.alias("n_bytes")
|
90
|
+
# )
|
91
|
+
# # =>
|
92
|
+
# # shape: (4, 3)
|
93
|
+
# # ┌──────┬─────────┬─────────┐
|
94
|
+
# # │ a ┆ n_chars ┆ n_bytes │
|
95
|
+
# # │ --- ┆ --- ┆ --- │
|
96
|
+
# # │ cat ┆ u32 ┆ u32 │
|
97
|
+
# # ╞══════╪═════════╪═════════╡
|
98
|
+
# # │ Café ┆ 4 ┆ 5 │
|
99
|
+
# # │ 345 ┆ 3 ┆ 3 │
|
100
|
+
# # │ 東京 ┆ 2 ┆ 6 │
|
101
|
+
# # │ null ┆ null ┆ null │
|
102
|
+
# # └──────┴─────────┴─────────┘
|
103
|
+
def len_chars
|
104
|
+
Utils.wrap_expr(_rbexpr.cat_len_chars)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Check if string representations of values start with a substring.
|
108
|
+
#
|
109
|
+
# @param prefix [String]
|
110
|
+
# Prefix substring.
|
111
|
+
#
|
112
|
+
# @return [Expr]
|
113
|
+
#
|
114
|
+
# @note
|
115
|
+
# Whereas `str.starts_with` allows expression inputs, `cat.starts_with` requires
|
116
|
+
# a literal string value.
|
117
|
+
#
|
118
|
+
# @example
|
119
|
+
# df = Polars::DataFrame.new(
|
120
|
+
# {"fruits" => Polars::Series.new(["apple", "mango", nil], dtype: Polars::Categorical)}
|
121
|
+
# )
|
122
|
+
# df.with_columns(
|
123
|
+
# Polars.col("fruits").cat.starts_with("app").alias("has_prefix")
|
124
|
+
# )
|
125
|
+
# # =>
|
126
|
+
# # shape: (3, 2)
|
127
|
+
# # ┌────────┬────────────┐
|
128
|
+
# # │ fruits ┆ has_prefix │
|
129
|
+
# # │ --- ┆ --- │
|
130
|
+
# # │ cat ┆ bool │
|
131
|
+
# # ╞════════╪════════════╡
|
132
|
+
# # │ apple ┆ true │
|
133
|
+
# # │ mango ┆ false │
|
134
|
+
# # │ null ┆ null │
|
135
|
+
# # └────────┴────────────┘
|
136
|
+
#
|
137
|
+
# @example Using `starts_with` as a filter condition:
|
138
|
+
# df.filter(Polars.col("fruits").cat.starts_with("app"))
|
139
|
+
# # =>
|
140
|
+
# # shape: (1, 1)
|
141
|
+
# # ┌────────┐
|
142
|
+
# # │ fruits │
|
143
|
+
# # │ --- │
|
144
|
+
# # │ cat │
|
145
|
+
# # ╞════════╡
|
146
|
+
# # │ apple │
|
147
|
+
# # └────────┘
|
148
|
+
def starts_with(prefix)
|
149
|
+
if !prefix.is_a?(::String)
|
150
|
+
msg = "'prefix' must be a string; found #{prefix.inspect}"
|
151
|
+
raise TypeError, msg
|
152
|
+
end
|
153
|
+
Utils.wrap_expr(_rbexpr.cat_starts_with(prefix))
|
154
|
+
end
|
155
|
+
|
156
|
+
# Check if string representations of values end with a substring.
|
157
|
+
#
|
158
|
+
# @param suffix [String]
|
159
|
+
# Suffix substring.
|
160
|
+
#
|
161
|
+
# @return [Expr]
|
162
|
+
#
|
163
|
+
# @note
|
164
|
+
# Whereas `str.ends_with` allows expression inputs, `cat.ends_with` requires a
|
165
|
+
# literal string value.
|
166
|
+
#
|
167
|
+
# @example
|
168
|
+
# df = Polars::DataFrame.new(
|
169
|
+
# {"fruits" => Polars::Series.new(["apple", "mango", nil], dtype: Polars::Categorical)}
|
170
|
+
# )
|
171
|
+
# df.with_columns(Polars.col("fruits").cat.ends_with("go").alias("has_suffix"))
|
172
|
+
# # =>
|
173
|
+
# # shape: (3, 2)
|
174
|
+
# # ┌────────┬────────────┐
|
175
|
+
# # │ fruits ┆ has_suffix │
|
176
|
+
# # │ --- ┆ --- │
|
177
|
+
# # │ cat ┆ bool │
|
178
|
+
# # ╞════════╪════════════╡
|
179
|
+
# # │ apple ┆ false │
|
180
|
+
# # │ mango ┆ true │
|
181
|
+
# # │ null ┆ null │
|
182
|
+
# # └────────┴────────────┘
|
183
|
+
#
|
184
|
+
# @example Using `ends_with` as a filter condition:
|
185
|
+
# df.filter(Polars.col("fruits").cat.ends_with("go"))
|
186
|
+
# # =>
|
187
|
+
# # shape: (1, 1)
|
188
|
+
# # ┌────────┐
|
189
|
+
# # │ fruits │
|
190
|
+
# # │ --- │
|
191
|
+
# # │ cat │
|
192
|
+
# # ╞════════╡
|
193
|
+
# # │ mango │
|
194
|
+
# # └────────┘
|
195
|
+
def ends_with(suffix)
|
196
|
+
if !suffix.is_a?(::String)
|
197
|
+
msg = "'suffix' must be a string; found #{suffix.inspect}"
|
198
|
+
raise TypeError, msg
|
199
|
+
end
|
200
|
+
Utils.wrap_expr(_rbexpr.cat_ends_with(suffix))
|
201
|
+
end
|
202
|
+
|
203
|
+
# Extract a substring from the string representation of each value.
|
204
|
+
#
|
205
|
+
# @param offset [Integer]
|
206
|
+
# Start index. Negative indexing is supported.
|
207
|
+
# @param length [Integer]
|
208
|
+
# Length of the slice. If set to `nil` (default), the slice is taken to the
|
209
|
+
# end of the string.
|
210
|
+
#
|
211
|
+
# @return [Expr]
|
212
|
+
#
|
213
|
+
# @note
|
214
|
+
# Both the `offset` and `length` inputs are defined in terms of the number
|
215
|
+
# of characters in the (UTF8) string. A character is defined as a
|
216
|
+
# [Unicode scalar value](https://www.unicode.org/glossary/#unicode_scalar_value). A single character is represented by a single byte
|
217
|
+
# when working with ASCII text, and a maximum of 4 bytes otherwise.
|
218
|
+
#
|
219
|
+
# @example
|
220
|
+
# df = Polars::DataFrame.new(
|
221
|
+
# {
|
222
|
+
# "s" => Polars::Series.new(
|
223
|
+
# ["pear", nil, "papaya", "dragonfruit"],
|
224
|
+
# dtype: Polars::Categorical
|
225
|
+
# )
|
226
|
+
# }
|
227
|
+
# )
|
228
|
+
# df.with_columns(Polars.col("s").cat.slice(-3).alias("slice"))
|
229
|
+
# # =>
|
230
|
+
# # shape: (4, 2)
|
231
|
+
# # ┌─────────────┬───────┐
|
232
|
+
# # │ s ┆ slice │
|
233
|
+
# # │ --- ┆ --- │
|
234
|
+
# # │ cat ┆ str │
|
235
|
+
# # ╞═════════════╪═══════╡
|
236
|
+
# # │ pear ┆ ear │
|
237
|
+
# # │ null ┆ null │
|
238
|
+
# # │ papaya ┆ aya │
|
239
|
+
# # │ dragonfruit ┆ uit │
|
240
|
+
# # └─────────────┴───────┘
|
241
|
+
#
|
242
|
+
# @example Using the optional `length` parameter
|
243
|
+
# df.with_columns(Polars.col("s").cat.slice(4, 3).alias("slice"))
|
244
|
+
# # =>
|
245
|
+
# # shape: (4, 2)
|
246
|
+
# # ┌─────────────┬───────┐
|
247
|
+
# # │ s ┆ slice │
|
248
|
+
# # │ --- ┆ --- │
|
249
|
+
# # │ cat ┆ str │
|
250
|
+
# # ╞═════════════╪═══════╡
|
251
|
+
# # │ pear ┆ │
|
252
|
+
# # │ null ┆ null │
|
253
|
+
# # │ papaya ┆ ya │
|
254
|
+
# # │ dragonfruit ┆ onf │
|
255
|
+
# # └─────────────┴───────┘
|
256
|
+
def slice(offset, length = nil)
|
257
|
+
Utils.wrap_expr(_rbexpr.cat_slice(offset, length))
|
258
|
+
end
|
35
259
|
end
|
36
260
|
end
|