cudf-polars-cu13 25.10.0__py3-none-any.whl → 25.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +32 -8
- cudf_polars/containers/column.py +94 -59
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +235 -102
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +9 -3
- cudf_polars/dsl/expressions/unary.py +117 -58
- cudf_polars/dsl/ir.py +923 -290
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +294 -97
- cudf_polars/dsl/utils/aggregations.py +34 -26
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +45 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +791 -1
- cudf_polars/experimental/benchmarks/utils.py +515 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +22 -10
- cudf_polars/experimental/groupby.py +23 -4
- cudf_polars/experimental/io.py +93 -83
- cudf_polars/experimental/join.py +39 -22
- cudf_polars/experimental/parallel.py +60 -14
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/core.py +361 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +150 -0
- cudf_polars/experimental/rapidsmpf/io.py +604 -0
- cudf_polars/experimental/rapidsmpf/join.py +237 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +494 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +151 -0
- cudf_polars/experimental/rapidsmpf/shuffle.py +277 -0
- cudf_polars/experimental/rapidsmpf/union.py +96 -0
- cudf_polars/experimental/rapidsmpf/utils.py +162 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +28 -8
- cudf_polars/experimental/sort.py +92 -25
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +88 -15
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +406 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +3 -2
- cudf_polars_cu13-25.12.0.dist-info/METADATA +182 -0
- cudf_polars_cu13-25.12.0.dist-info/RECORD +104 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/WHEEL +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/top_level.txt +0 -0
|
@@ -10,10 +10,10 @@ import functools
|
|
|
10
10
|
import re
|
|
11
11
|
from datetime import datetime
|
|
12
12
|
from enum import IntEnum, auto
|
|
13
|
-
from typing import TYPE_CHECKING, Any, ClassVar
|
|
13
|
+
from typing import TYPE_CHECKING, Any, ClassVar, cast
|
|
14
14
|
|
|
15
|
+
from polars import Struct as pl_Struct, polars # type: ignore[attr-defined]
|
|
15
16
|
from polars.exceptions import InvalidOperationError
|
|
16
|
-
from polars.polars import dtype_str_repr
|
|
17
17
|
|
|
18
18
|
import pylibcudf as plc
|
|
19
19
|
|
|
@@ -26,8 +26,6 @@ from cudf_polars.utils.versions import POLARS_VERSION_LT_132
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
27
27
|
from typing_extensions import Self
|
|
28
28
|
|
|
29
|
-
from polars.polars import _expr_nodes as pl_expr
|
|
30
|
-
|
|
31
29
|
from cudf_polars.containers import DataFrame, DataType
|
|
32
30
|
|
|
33
31
|
__all__ = ["StringFunction"]
|
|
@@ -37,10 +35,15 @@ JsonDecodeType = list[tuple[str, plc.DataType, "JsonDecodeType"]]
|
|
|
37
35
|
|
|
38
36
|
def _dtypes_for_json_decode(dtype: DataType) -> JsonDecodeType:
|
|
39
37
|
"""Get the dtypes for json decode."""
|
|
38
|
+
# Type checker doesn't narrow polars_type through dtype.id() check
|
|
40
39
|
if dtype.id() == plc.TypeId.STRUCT:
|
|
41
40
|
return [
|
|
42
|
-
(field.name, child.
|
|
43
|
-
for field, child in zip(
|
|
41
|
+
(field.name, child.plc_type, _dtypes_for_json_decode(child))
|
|
42
|
+
for field, child in zip(
|
|
43
|
+
cast(pl_Struct, dtype.polars_type).fields,
|
|
44
|
+
dtype.children,
|
|
45
|
+
strict=True,
|
|
46
|
+
)
|
|
44
47
|
]
|
|
45
48
|
else:
|
|
46
49
|
return []
|
|
@@ -96,7 +99,7 @@ class StringFunction(Expr):
|
|
|
96
99
|
ZFill = auto()
|
|
97
100
|
|
|
98
101
|
@classmethod
|
|
99
|
-
def from_polars(cls, obj:
|
|
102
|
+
def from_polars(cls, obj: polars._expr_nodes.StringFunction) -> Self:
|
|
100
103
|
"""Convert from polars' `StringFunction`."""
|
|
101
104
|
try:
|
|
102
105
|
function, name = str(obj).split(".", maxsplit=1)
|
|
@@ -278,7 +281,7 @@ class StringFunction(Expr):
|
|
|
278
281
|
and width.value is not None
|
|
279
282
|
and width.value < 0
|
|
280
283
|
): # pragma: no cover
|
|
281
|
-
dtypestr = dtype_str_repr(width.dtype.
|
|
284
|
+
dtypestr = polars.dtype_str_repr(width.dtype.polars_type)
|
|
282
285
|
raise InvalidOperationError(
|
|
283
286
|
f"conversion from `{dtypestr}` to `u64` "
|
|
284
287
|
f"failed in column 'literal' for 1 out of "
|
|
@@ -310,14 +313,17 @@ class StringFunction(Expr):
|
|
|
310
313
|
columns = [
|
|
311
314
|
Column(
|
|
312
315
|
child.evaluate(df, context=context).obj, dtype=child.dtype
|
|
313
|
-
).astype(self.dtype)
|
|
316
|
+
).astype(self.dtype, stream=df.stream)
|
|
314
317
|
for child in self.children
|
|
315
318
|
]
|
|
319
|
+
if len(columns) == 1:
|
|
320
|
+
return columns[0]
|
|
316
321
|
|
|
317
322
|
non_unit_sizes = [c.size for c in columns if c.size != 1]
|
|
318
323
|
broadcasted = broadcast(
|
|
319
324
|
*columns,
|
|
320
325
|
target_length=max(non_unit_sizes) if non_unit_sizes else None,
|
|
326
|
+
stream=df.stream,
|
|
321
327
|
)
|
|
322
328
|
|
|
323
329
|
delimiter, ignore_nulls = self.options
|
|
@@ -325,24 +331,39 @@ class StringFunction(Expr):
|
|
|
325
331
|
return Column(
|
|
326
332
|
plc.strings.combine.concatenate(
|
|
327
333
|
plc.Table([col.obj for col in broadcasted]),
|
|
328
|
-
plc.Scalar.from_py(
|
|
329
|
-
|
|
334
|
+
plc.Scalar.from_py(
|
|
335
|
+
delimiter, self.dtype.plc_type, stream=df.stream
|
|
336
|
+
),
|
|
337
|
+
None
|
|
338
|
+
if ignore_nulls
|
|
339
|
+
else plc.Scalar.from_py(
|
|
340
|
+
None, self.dtype.plc_type, stream=df.stream
|
|
341
|
+
),
|
|
330
342
|
None,
|
|
331
343
|
plc.strings.combine.SeparatorOnNulls.NO,
|
|
344
|
+
stream=df.stream,
|
|
332
345
|
),
|
|
333
346
|
dtype=self.dtype,
|
|
334
347
|
)
|
|
335
348
|
elif self.name is StringFunction.Name.ConcatVertical:
|
|
336
349
|
(child,) = self.children
|
|
337
|
-
column = child.evaluate(df, context=context).astype(
|
|
350
|
+
column = child.evaluate(df, context=context).astype(
|
|
351
|
+
self.dtype, stream=df.stream
|
|
352
|
+
)
|
|
338
353
|
delimiter, ignore_nulls = self.options
|
|
339
354
|
if column.null_count > 0 and not ignore_nulls:
|
|
340
|
-
return Column(
|
|
355
|
+
return Column(
|
|
356
|
+
plc.Column.all_null_like(column.obj, 1, stream=df.stream),
|
|
357
|
+
dtype=self.dtype,
|
|
358
|
+
)
|
|
341
359
|
return Column(
|
|
342
360
|
plc.strings.combine.join_strings(
|
|
343
361
|
column.obj,
|
|
344
|
-
plc.Scalar.from_py(
|
|
345
|
-
|
|
362
|
+
plc.Scalar.from_py(
|
|
363
|
+
delimiter, self.dtype.plc_type, stream=df.stream
|
|
364
|
+
),
|
|
365
|
+
plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
|
|
366
|
+
stream=df.stream,
|
|
346
367
|
),
|
|
347
368
|
dtype=self.dtype,
|
|
348
369
|
)
|
|
@@ -351,18 +372,24 @@ class StringFunction(Expr):
|
|
|
351
372
|
# polars pads based on bytes, libcudf by visual width
|
|
352
373
|
# only pass chars if the visual width matches the byte length
|
|
353
374
|
column = self.children[0].evaluate(df, context=context)
|
|
354
|
-
col_len_bytes = plc.strings.attributes.count_bytes(
|
|
355
|
-
|
|
375
|
+
col_len_bytes = plc.strings.attributes.count_bytes(
|
|
376
|
+
column.obj, stream=df.stream
|
|
377
|
+
)
|
|
378
|
+
col_len_chars = plc.strings.attributes.count_characters(
|
|
379
|
+
column.obj, stream=df.stream
|
|
380
|
+
)
|
|
356
381
|
equal = plc.binaryop.binary_operation(
|
|
357
382
|
col_len_bytes,
|
|
358
383
|
col_len_chars,
|
|
359
384
|
plc.binaryop.BinaryOperator.NULL_EQUALS,
|
|
360
385
|
plc.DataType(plc.TypeId.BOOL8),
|
|
386
|
+
stream=df.stream,
|
|
361
387
|
)
|
|
362
388
|
if not plc.reduce.reduce(
|
|
363
389
|
equal,
|
|
364
390
|
plc.aggregation.all(),
|
|
365
391
|
plc.DataType(plc.TypeId.BOOL8),
|
|
392
|
+
stream=df.stream,
|
|
366
393
|
).to_py():
|
|
367
394
|
raise InvalidOperationError(
|
|
368
395
|
"zfill only supports ascii strings with no unicode characters"
|
|
@@ -373,22 +400,31 @@ class StringFunction(Expr):
|
|
|
373
400
|
if width.value is None:
|
|
374
401
|
return Column(
|
|
375
402
|
plc.Column.from_scalar(
|
|
376
|
-
plc.Scalar.from_py(
|
|
403
|
+
plc.Scalar.from_py(
|
|
404
|
+
None, self.dtype.plc_type, stream=df.stream
|
|
405
|
+
),
|
|
377
406
|
column.size,
|
|
407
|
+
stream=df.stream,
|
|
378
408
|
),
|
|
379
409
|
self.dtype,
|
|
380
410
|
)
|
|
381
411
|
return Column(
|
|
382
|
-
plc.strings.padding.zfill(
|
|
412
|
+
plc.strings.padding.zfill(
|
|
413
|
+
column.obj, width.value, stream=df.stream
|
|
414
|
+
),
|
|
415
|
+
self.dtype,
|
|
383
416
|
)
|
|
384
417
|
else:
|
|
385
418
|
col_width = self.children[1].evaluate(df, context=context)
|
|
386
419
|
assert isinstance(col_width, Column)
|
|
387
420
|
all_gt_0 = plc.binaryop.binary_operation(
|
|
388
421
|
col_width.obj,
|
|
389
|
-
plc.Scalar.from_py(
|
|
422
|
+
plc.Scalar.from_py(
|
|
423
|
+
0, plc.DataType(plc.TypeId.INT64), stream=df.stream
|
|
424
|
+
),
|
|
390
425
|
plc.binaryop.BinaryOperator.GREATER_EQUAL,
|
|
391
426
|
plc.DataType(plc.TypeId.BOOL8),
|
|
427
|
+
stream=df.stream,
|
|
392
428
|
)
|
|
393
429
|
|
|
394
430
|
if (
|
|
@@ -397,12 +433,15 @@ class StringFunction(Expr):
|
|
|
397
433
|
all_gt_0,
|
|
398
434
|
plc.aggregation.all(),
|
|
399
435
|
plc.DataType(plc.TypeId.BOOL8),
|
|
436
|
+
stream=df.stream,
|
|
400
437
|
).to_py()
|
|
401
438
|
): # pragma: no cover
|
|
402
439
|
raise InvalidOperationError("fill conversion failed.")
|
|
403
440
|
|
|
404
441
|
return Column(
|
|
405
|
-
plc.strings.padding.zfill_by_widths(
|
|
442
|
+
plc.strings.padding.zfill_by_widths(
|
|
443
|
+
column.obj, col_width.obj, stream=df.stream
|
|
444
|
+
),
|
|
406
445
|
self.dtype,
|
|
407
446
|
)
|
|
408
447
|
|
|
@@ -414,34 +453,39 @@ class StringFunction(Expr):
|
|
|
414
453
|
if literal:
|
|
415
454
|
pat = arg.evaluate(df, context=context)
|
|
416
455
|
pattern = (
|
|
417
|
-
pat.obj_scalar
|
|
456
|
+
pat.obj_scalar(stream=df.stream)
|
|
418
457
|
if pat.is_scalar and pat.size != column.size
|
|
419
458
|
else pat.obj
|
|
420
459
|
)
|
|
421
460
|
return Column(
|
|
422
|
-
plc.strings.find.contains(column.obj, pattern
|
|
461
|
+
plc.strings.find.contains(column.obj, pattern, stream=df.stream),
|
|
462
|
+
dtype=self.dtype,
|
|
423
463
|
)
|
|
424
464
|
else:
|
|
425
465
|
return Column(
|
|
426
|
-
plc.strings.contains.contains_re(
|
|
466
|
+
plc.strings.contains.contains_re(
|
|
467
|
+
column.obj, self._regex_program, stream=df.stream
|
|
468
|
+
),
|
|
427
469
|
dtype=self.dtype,
|
|
428
470
|
)
|
|
429
471
|
elif self.name is StringFunction.Name.ContainsAny:
|
|
430
472
|
(ascii_case_insensitive,) = self.options
|
|
431
473
|
child, arg = self.children
|
|
432
|
-
|
|
433
|
-
|
|
474
|
+
plc_column = child.evaluate(df, context=context).obj
|
|
475
|
+
plc_targets = arg.evaluate(df, context=context).obj
|
|
434
476
|
if ascii_case_insensitive:
|
|
435
|
-
|
|
436
|
-
|
|
477
|
+
plc_column = plc.strings.case.to_lower(plc_column, stream=df.stream)
|
|
478
|
+
plc_targets = plc.strings.case.to_lower(plc_targets, stream=df.stream)
|
|
437
479
|
contains = plc.strings.find_multiple.contains_multiple(
|
|
438
|
-
|
|
439
|
-
|
|
480
|
+
plc_column,
|
|
481
|
+
plc_targets,
|
|
482
|
+
stream=df.stream,
|
|
440
483
|
)
|
|
441
484
|
binary_or = functools.partial(
|
|
442
485
|
plc.binaryop.binary_operation,
|
|
443
486
|
op=plc.binaryop.BinaryOperator.BITWISE_OR,
|
|
444
|
-
output_type=self.dtype.
|
|
487
|
+
output_type=self.dtype.plc_type,
|
|
488
|
+
stream=df.stream,
|
|
445
489
|
)
|
|
446
490
|
return Column(
|
|
447
491
|
functools.reduce(binary_or, contains.columns()),
|
|
@@ -449,28 +493,30 @@ class StringFunction(Expr):
|
|
|
449
493
|
)
|
|
450
494
|
elif self.name is StringFunction.Name.CountMatches:
|
|
451
495
|
(child, _) = self.children
|
|
452
|
-
|
|
496
|
+
plc_column = child.evaluate(df, context=context).obj
|
|
453
497
|
return Column(
|
|
454
498
|
plc.unary.cast(
|
|
455
|
-
plc.strings.contains.count_re(
|
|
456
|
-
|
|
499
|
+
plc.strings.contains.count_re(
|
|
500
|
+
plc_column, self._regex_program, stream=df.stream
|
|
501
|
+
),
|
|
502
|
+
self.dtype.plc_type,
|
|
503
|
+
stream=df.stream,
|
|
457
504
|
),
|
|
458
505
|
dtype=self.dtype,
|
|
459
506
|
)
|
|
460
507
|
elif self.name is StringFunction.Name.Extract:
|
|
461
508
|
(group_index,) = self.options
|
|
462
|
-
|
|
509
|
+
plc_column = self.children[0].evaluate(df, context=context).obj
|
|
463
510
|
return Column(
|
|
464
511
|
plc.strings.extract.extract_single(
|
|
465
|
-
|
|
512
|
+
plc_column, self._regex_program, group_index - 1, stream=df.stream
|
|
466
513
|
),
|
|
467
514
|
dtype=self.dtype,
|
|
468
515
|
)
|
|
469
516
|
elif self.name is StringFunction.Name.ExtractGroups:
|
|
470
|
-
|
|
517
|
+
plc_column = self.children[0].evaluate(df, context=context).obj
|
|
471
518
|
plc_table = plc.strings.extract.extract(
|
|
472
|
-
|
|
473
|
-
self._regex_program,
|
|
519
|
+
plc_column, self._regex_program, stream=df.stream
|
|
474
520
|
)
|
|
475
521
|
return Column(
|
|
476
522
|
plc.Column.struct_from_children(plc_table.columns()),
|
|
@@ -479,38 +525,45 @@ class StringFunction(Expr):
|
|
|
479
525
|
elif self.name is StringFunction.Name.Find:
|
|
480
526
|
literal, _ = self.options
|
|
481
527
|
(child, expr) = self.children
|
|
482
|
-
|
|
528
|
+
plc_column = child.evaluate(df, context=context).obj
|
|
483
529
|
if literal:
|
|
484
530
|
assert isinstance(expr, Literal)
|
|
485
531
|
plc_column = plc.strings.find.find(
|
|
486
|
-
|
|
487
|
-
plc.Scalar.from_py(
|
|
532
|
+
plc_column,
|
|
533
|
+
plc.Scalar.from_py(
|
|
534
|
+
expr.value, expr.dtype.plc_type, stream=df.stream
|
|
535
|
+
),
|
|
536
|
+
stream=df.stream,
|
|
488
537
|
)
|
|
489
538
|
else:
|
|
490
539
|
plc_column = plc.strings.findall.find_re(
|
|
491
|
-
|
|
492
|
-
self._regex_program,
|
|
540
|
+
plc_column, self._regex_program, stream=df.stream
|
|
493
541
|
)
|
|
494
542
|
# Polars returns None for not found, libcudf returns -1
|
|
495
543
|
new_mask, null_count = plc.transform.bools_to_mask(
|
|
496
544
|
plc.binaryop.binary_operation(
|
|
497
545
|
plc_column,
|
|
498
|
-
plc.Scalar.from_py(-1, plc_column.type()),
|
|
546
|
+
plc.Scalar.from_py(-1, plc_column.type(), stream=df.stream),
|
|
499
547
|
plc.binaryop.BinaryOperator.NOT_EQUAL,
|
|
500
548
|
plc.DataType(plc.TypeId.BOOL8),
|
|
501
|
-
|
|
549
|
+
stream=df.stream,
|
|
550
|
+
),
|
|
551
|
+
stream=df.stream,
|
|
502
552
|
)
|
|
503
553
|
plc_column = plc.unary.cast(
|
|
504
|
-
plc_column.with_mask(new_mask, null_count),
|
|
554
|
+
plc_column.with_mask(new_mask, null_count),
|
|
555
|
+
self.dtype.plc_type,
|
|
556
|
+
stream=df.stream,
|
|
505
557
|
)
|
|
506
558
|
return Column(plc_column, dtype=self.dtype)
|
|
507
559
|
elif self.name is StringFunction.Name.JsonDecode:
|
|
508
560
|
plc_column = self.children[0].evaluate(df, context=context).obj
|
|
509
561
|
plc_table_with_metadata = plc.io.json.read_json_from_string_column(
|
|
510
562
|
plc_column,
|
|
511
|
-
plc.Scalar.from_py("\n"),
|
|
512
|
-
plc.Scalar.from_py("NULL"),
|
|
563
|
+
plc.Scalar.from_py("\n", stream=df.stream),
|
|
564
|
+
plc.Scalar.from_py("NULL", stream=df.stream),
|
|
513
565
|
_dtypes_for_json_decode(self.dtype),
|
|
566
|
+
stream=df.stream,
|
|
514
567
|
)
|
|
515
568
|
return Column(
|
|
516
569
|
plc.Column.struct_from_children(plc_table_with_metadata.columns),
|
|
@@ -518,26 +571,34 @@ class StringFunction(Expr):
|
|
|
518
571
|
)
|
|
519
572
|
elif self.name is StringFunction.Name.JsonPathMatch:
|
|
520
573
|
(child, expr) = self.children
|
|
521
|
-
|
|
574
|
+
plc_column = child.evaluate(df, context=context).obj
|
|
522
575
|
assert isinstance(expr, Literal)
|
|
523
|
-
json_path = plc.Scalar.from_py(
|
|
576
|
+
json_path = plc.Scalar.from_py(
|
|
577
|
+
expr.value, expr.dtype.plc_type, stream=df.stream
|
|
578
|
+
)
|
|
524
579
|
return Column(
|
|
525
|
-
plc.json.get_json_object(
|
|
580
|
+
plc.json.get_json_object(plc_column, json_path, stream=df.stream),
|
|
526
581
|
dtype=self.dtype,
|
|
527
582
|
)
|
|
528
583
|
elif self.name is StringFunction.Name.LenBytes:
|
|
529
|
-
|
|
584
|
+
plc_column = self.children[0].evaluate(df, context=context).obj
|
|
530
585
|
return Column(
|
|
531
586
|
plc.unary.cast(
|
|
532
|
-
plc.strings.attributes.count_bytes(
|
|
587
|
+
plc.strings.attributes.count_bytes(plc_column, stream=df.stream),
|
|
588
|
+
self.dtype.plc_type,
|
|
589
|
+
stream=df.stream,
|
|
533
590
|
),
|
|
534
591
|
dtype=self.dtype,
|
|
535
592
|
)
|
|
536
593
|
elif self.name is StringFunction.Name.LenChars:
|
|
537
|
-
|
|
594
|
+
plc_column = self.children[0].evaluate(df, context=context).obj
|
|
538
595
|
return Column(
|
|
539
596
|
plc.unary.cast(
|
|
540
|
-
plc.strings.attributes.count_characters(
|
|
597
|
+
plc.strings.attributes.count_characters(
|
|
598
|
+
plc_column, stream=df.stream
|
|
599
|
+
),
|
|
600
|
+
self.dtype.plc_type,
|
|
601
|
+
stream=df.stream,
|
|
541
602
|
),
|
|
542
603
|
dtype=self.dtype,
|
|
543
604
|
)
|
|
@@ -567,8 +628,13 @@ class StringFunction(Expr):
|
|
|
567
628
|
return Column(
|
|
568
629
|
plc.strings.slice.slice_strings(
|
|
569
630
|
column.obj,
|
|
570
|
-
plc.Scalar.from_py(
|
|
571
|
-
|
|
631
|
+
plc.Scalar.from_py(
|
|
632
|
+
start, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
633
|
+
),
|
|
634
|
+
plc.Scalar.from_py(
|
|
635
|
+
stop, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
636
|
+
),
|
|
637
|
+
stream=df.stream,
|
|
572
638
|
),
|
|
573
639
|
dtype=self.dtype,
|
|
574
640
|
)
|
|
@@ -582,7 +648,7 @@ class StringFunction(Expr):
|
|
|
582
648
|
column = child.evaluate(df, context=context)
|
|
583
649
|
if n == 1 and self.name is StringFunction.Name.SplitN:
|
|
584
650
|
plc_column = plc.Column(
|
|
585
|
-
self.dtype.
|
|
651
|
+
self.dtype.plc_type,
|
|
586
652
|
column.obj.size(),
|
|
587
653
|
None,
|
|
588
654
|
None,
|
|
@@ -592,7 +658,9 @@ class StringFunction(Expr):
|
|
|
592
658
|
)
|
|
593
659
|
else:
|
|
594
660
|
assert isinstance(expr, Literal)
|
|
595
|
-
by = plc.Scalar.from_py(
|
|
661
|
+
by = plc.Scalar.from_py(
|
|
662
|
+
expr.value, expr.dtype.plc_type, stream=df.stream
|
|
663
|
+
)
|
|
596
664
|
# See https://github.com/pola-rs/polars/issues/11640
|
|
597
665
|
# for SplitN vs SplitExact edge case behaviors
|
|
598
666
|
max_splits = n if is_split_n else 0
|
|
@@ -600,13 +668,16 @@ class StringFunction(Expr):
|
|
|
600
668
|
column.obj,
|
|
601
669
|
by,
|
|
602
670
|
max_splits - 1,
|
|
671
|
+
stream=df.stream,
|
|
603
672
|
)
|
|
604
673
|
children = plc_table.columns()
|
|
605
674
|
ref_column = children[0]
|
|
606
675
|
if (remainder := n - len(children)) > 0:
|
|
607
676
|
# Reach expected number of splits by padding with nulls
|
|
608
677
|
children.extend(
|
|
609
|
-
plc.Column.all_null_like(
|
|
678
|
+
plc.Column.all_null_like(
|
|
679
|
+
ref_column, ref_column.size(), stream=df.stream
|
|
680
|
+
)
|
|
610
681
|
for _ in range(remainder + int(not is_split_n))
|
|
611
682
|
)
|
|
612
683
|
if not is_split_n:
|
|
@@ -614,7 +685,7 @@ class StringFunction(Expr):
|
|
|
614
685
|
# TODO: Use plc.Column.struct_from_children once it is generalized
|
|
615
686
|
# to handle columns that don't share the same null_mask/null_count
|
|
616
687
|
plc_column = plc.Column(
|
|
617
|
-
self.dtype.
|
|
688
|
+
self.dtype.plc_type,
|
|
618
689
|
ref_column.size(),
|
|
619
690
|
None,
|
|
620
691
|
None,
|
|
@@ -628,9 +699,11 @@ class StringFunction(Expr):
|
|
|
628
699
|
StringFunction.Name.StripSuffix,
|
|
629
700
|
}:
|
|
630
701
|
child, expr = self.children
|
|
631
|
-
|
|
702
|
+
plc_column = child.evaluate(df, context=context).obj
|
|
632
703
|
assert isinstance(expr, Literal)
|
|
633
|
-
target = plc.Scalar.from_py(
|
|
704
|
+
target = plc.Scalar.from_py(
|
|
705
|
+
expr.value, expr.dtype.plc_type, stream=df.stream
|
|
706
|
+
)
|
|
634
707
|
if self.name == StringFunction.Name.StripPrefix:
|
|
635
708
|
find = plc.strings.find.starts_with
|
|
636
709
|
start = len(expr.value)
|
|
@@ -640,17 +713,23 @@ class StringFunction(Expr):
|
|
|
640
713
|
start = 0
|
|
641
714
|
end = -len(expr.value)
|
|
642
715
|
|
|
643
|
-
mask = find(
|
|
716
|
+
mask = find(plc_column, target, stream=df.stream)
|
|
644
717
|
sliced = plc.strings.slice.slice_strings(
|
|
645
|
-
|
|
646
|
-
plc.Scalar.from_py(
|
|
647
|
-
|
|
718
|
+
plc_column,
|
|
719
|
+
plc.Scalar.from_py(
|
|
720
|
+
start, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
721
|
+
),
|
|
722
|
+
plc.Scalar.from_py(
|
|
723
|
+
end, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
724
|
+
),
|
|
725
|
+
stream=df.stream,
|
|
648
726
|
)
|
|
649
727
|
return Column(
|
|
650
728
|
plc.copying.copy_if_else(
|
|
651
729
|
sliced,
|
|
652
|
-
|
|
730
|
+
plc_column,
|
|
653
731
|
mask,
|
|
732
|
+
stream=df.stream,
|
|
654
733
|
),
|
|
655
734
|
dtype=self.dtype,
|
|
656
735
|
)
|
|
@@ -667,7 +746,12 @@ class StringFunction(Expr):
|
|
|
667
746
|
else:
|
|
668
747
|
side = plc.strings.SideType.BOTH
|
|
669
748
|
return Column(
|
|
670
|
-
plc.strings.strip.strip(
|
|
749
|
+
plc.strings.strip.strip(
|
|
750
|
+
column.obj,
|
|
751
|
+
side,
|
|
752
|
+
chars.obj_scalar(stream=df.stream),
|
|
753
|
+
stream=df.stream,
|
|
754
|
+
),
|
|
671
755
|
dtype=self.dtype,
|
|
672
756
|
)
|
|
673
757
|
|
|
@@ -678,15 +762,17 @@ class StringFunction(Expr):
|
|
|
678
762
|
if self.children[1].value is None:
|
|
679
763
|
return Column(
|
|
680
764
|
plc.Column.from_scalar(
|
|
681
|
-
plc.Scalar.from_py(None, self.dtype.
|
|
765
|
+
plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
|
|
682
766
|
column.size,
|
|
767
|
+
stream=df.stream,
|
|
683
768
|
),
|
|
684
769
|
self.dtype,
|
|
685
770
|
)
|
|
686
771
|
elif self.children[1].value == 0:
|
|
687
772
|
result = plc.Column.from_scalar(
|
|
688
|
-
plc.Scalar.from_py("", self.dtype.
|
|
773
|
+
plc.Scalar.from_py("", self.dtype.plc_type, stream=df.stream),
|
|
689
774
|
column.size,
|
|
775
|
+
stream=df.stream,
|
|
690
776
|
)
|
|
691
777
|
if column.obj.null_mask():
|
|
692
778
|
result = result.with_mask(
|
|
@@ -700,9 +786,14 @@ class StringFunction(Expr):
|
|
|
700
786
|
return Column(
|
|
701
787
|
plc.strings.slice.slice_strings(
|
|
702
788
|
column.obj,
|
|
703
|
-
plc.Scalar.from_py(
|
|
704
|
-
|
|
789
|
+
plc.Scalar.from_py(
|
|
790
|
+
start, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
791
|
+
),
|
|
792
|
+
plc.Scalar.from_py(
|
|
793
|
+
end, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
794
|
+
),
|
|
705
795
|
None,
|
|
796
|
+
stream=df.stream,
|
|
706
797
|
),
|
|
707
798
|
self.dtype,
|
|
708
799
|
)
|
|
@@ -715,16 +806,22 @@ class StringFunction(Expr):
|
|
|
715
806
|
if end is None:
|
|
716
807
|
return Column(
|
|
717
808
|
plc.Column.from_scalar(
|
|
718
|
-
plc.Scalar.from_py(None, self.dtype.
|
|
809
|
+
plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
|
|
719
810
|
column.size,
|
|
811
|
+
stream=df.stream,
|
|
720
812
|
),
|
|
721
813
|
self.dtype,
|
|
722
814
|
)
|
|
723
815
|
return Column(
|
|
724
816
|
plc.strings.slice.slice_strings(
|
|
725
817
|
column.obj,
|
|
726
|
-
plc.Scalar.from_py(
|
|
727
|
-
|
|
818
|
+
plc.Scalar.from_py(
|
|
819
|
+
0, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
820
|
+
),
|
|
821
|
+
plc.Scalar.from_py(
|
|
822
|
+
end, plc.DataType(plc.TypeId.INT32), stream=df.stream
|
|
823
|
+
),
|
|
824
|
+
stream=df.stream,
|
|
728
825
|
),
|
|
729
826
|
self.dtype,
|
|
730
827
|
)
|
|
@@ -732,18 +829,25 @@ class StringFunction(Expr):
|
|
|
732
829
|
columns = [child.evaluate(df, context=context) for child in self.children]
|
|
733
830
|
if self.name is StringFunction.Name.Lowercase:
|
|
734
831
|
(column,) = columns
|
|
735
|
-
return Column(
|
|
832
|
+
return Column(
|
|
833
|
+
plc.strings.case.to_lower(column.obj, stream=df.stream),
|
|
834
|
+
dtype=self.dtype,
|
|
835
|
+
)
|
|
736
836
|
elif self.name is StringFunction.Name.Uppercase:
|
|
737
837
|
(column,) = columns
|
|
738
|
-
return Column(
|
|
838
|
+
return Column(
|
|
839
|
+
plc.strings.case.to_upper(column.obj, stream=df.stream),
|
|
840
|
+
dtype=self.dtype,
|
|
841
|
+
)
|
|
739
842
|
elif self.name is StringFunction.Name.EndsWith:
|
|
740
843
|
column, suffix = columns
|
|
741
844
|
return Column(
|
|
742
845
|
plc.strings.find.ends_with(
|
|
743
846
|
column.obj,
|
|
744
|
-
suffix.obj_scalar
|
|
847
|
+
suffix.obj_scalar(stream=df.stream)
|
|
745
848
|
if column.size != suffix.size and suffix.is_scalar
|
|
746
849
|
else suffix.obj,
|
|
850
|
+
stream=df.stream,
|
|
747
851
|
),
|
|
748
852
|
dtype=self.dtype,
|
|
749
853
|
)
|
|
@@ -752,9 +856,10 @@ class StringFunction(Expr):
|
|
|
752
856
|
return Column(
|
|
753
857
|
plc.strings.find.starts_with(
|
|
754
858
|
column.obj,
|
|
755
|
-
prefix.obj_scalar
|
|
859
|
+
prefix.obj_scalar(stream=df.stream)
|
|
756
860
|
if column.size != prefix.size and prefix.is_scalar
|
|
757
861
|
else prefix.obj,
|
|
862
|
+
stream=df.stream,
|
|
758
863
|
),
|
|
759
864
|
dtype=self.dtype,
|
|
760
865
|
)
|
|
@@ -766,67 +871,80 @@ class StringFunction(Expr):
|
|
|
766
871
|
if plc_col.null_count() == plc_col.size():
|
|
767
872
|
return Column(
|
|
768
873
|
plc.Column.from_scalar(
|
|
769
|
-
plc.Scalar.from_py(None, self.dtype.
|
|
874
|
+
plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
|
|
770
875
|
plc_col.size(),
|
|
876
|
+
stream=df.stream,
|
|
771
877
|
),
|
|
772
878
|
self.dtype,
|
|
773
879
|
)
|
|
774
880
|
if format is None:
|
|
775
881
|
# Polars begins inference with the first non null value
|
|
776
882
|
if plc_col.null_mask() is not None:
|
|
777
|
-
boolmask = plc.unary.is_valid(plc_col)
|
|
883
|
+
boolmask = plc.unary.is_valid(plc_col, stream=df.stream)
|
|
778
884
|
table = plc.stream_compaction.apply_boolean_mask(
|
|
779
|
-
plc.Table([plc_col]), boolmask
|
|
885
|
+
plc.Table([plc_col]), boolmask, stream=df.stream
|
|
780
886
|
)
|
|
781
887
|
filtered = table.columns()[0]
|
|
782
|
-
first_valid_data = plc.copying.get_element(
|
|
888
|
+
first_valid_data = plc.copying.get_element(
|
|
889
|
+
filtered, 0, stream=df.stream
|
|
890
|
+
).to_py()
|
|
783
891
|
else:
|
|
784
|
-
first_valid_data = plc.copying.get_element(
|
|
892
|
+
first_valid_data = plc.copying.get_element(
|
|
893
|
+
plc_col, 0, stream=df.stream
|
|
894
|
+
).to_py()
|
|
785
895
|
|
|
786
|
-
|
|
896
|
+
# See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
|
|
897
|
+
format = _infer_datetime_format(first_valid_data) # type: ignore[arg-type]
|
|
787
898
|
if not format:
|
|
788
899
|
raise InvalidOperationError(
|
|
789
900
|
"Unable to infer datetime format from data"
|
|
790
901
|
)
|
|
791
902
|
|
|
792
903
|
is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
|
|
793
|
-
plc_col, format
|
|
904
|
+
plc_col, format, stream=df.stream
|
|
794
905
|
)
|
|
795
906
|
if strict:
|
|
796
907
|
if not plc.reduce.reduce(
|
|
797
908
|
is_timestamps,
|
|
798
909
|
plc.aggregation.all(),
|
|
799
910
|
plc.DataType(plc.TypeId.BOOL8),
|
|
911
|
+
stream=df.stream,
|
|
800
912
|
).to_py():
|
|
801
913
|
raise InvalidOperationError("conversion from `str` failed.")
|
|
802
914
|
else:
|
|
803
915
|
not_timestamps = plc.unary.unary_operation(
|
|
804
|
-
is_timestamps, plc.unary.UnaryOperator.NOT
|
|
916
|
+
is_timestamps, plc.unary.UnaryOperator.NOT, stream=df.stream
|
|
805
917
|
)
|
|
806
|
-
null = plc.Scalar.from_py(None, plc_col.type())
|
|
918
|
+
null = plc.Scalar.from_py(None, plc_col.type(), stream=df.stream)
|
|
807
919
|
plc_col = plc.copying.boolean_mask_scatter(
|
|
808
|
-
[null], plc.Table([plc_col]), not_timestamps
|
|
920
|
+
[null], plc.Table([plc_col]), not_timestamps, stream=df.stream
|
|
809
921
|
).columns()[0]
|
|
810
922
|
|
|
811
923
|
return Column(
|
|
812
924
|
plc.strings.convert.convert_datetime.to_timestamps(
|
|
813
|
-
plc_col, self.dtype.
|
|
925
|
+
plc_col, self.dtype.plc_type, format, stream=df.stream
|
|
814
926
|
),
|
|
815
927
|
dtype=self.dtype,
|
|
816
928
|
)
|
|
817
929
|
elif self.name is StringFunction.Name.Replace:
|
|
818
|
-
|
|
930
|
+
col_column, col_target, col_repl = columns
|
|
819
931
|
n, _ = self.options
|
|
820
932
|
return Column(
|
|
821
933
|
plc.strings.replace.replace(
|
|
822
|
-
|
|
934
|
+
col_column.obj,
|
|
935
|
+
col_target.obj_scalar(stream=df.stream),
|
|
936
|
+
col_repl.obj_scalar(stream=df.stream),
|
|
937
|
+
maxrepl=n,
|
|
938
|
+
stream=df.stream,
|
|
823
939
|
),
|
|
824
940
|
dtype=self.dtype,
|
|
825
941
|
)
|
|
826
942
|
elif self.name is StringFunction.Name.ReplaceMany:
|
|
827
|
-
|
|
943
|
+
col_column, col_target, col_repl = columns
|
|
828
944
|
return Column(
|
|
829
|
-
plc.strings.replace.replace_multiple(
|
|
945
|
+
plc.strings.replace.replace_multiple(
|
|
946
|
+
col_column.obj, col_target.obj, col_repl.obj, stream=df.stream
|
|
947
|
+
),
|
|
830
948
|
dtype=self.dtype,
|
|
831
949
|
)
|
|
832
950
|
elif self.name is StringFunction.Name.PadStart:
|
|
@@ -838,10 +956,15 @@ class StringFunction(Expr):
|
|
|
838
956
|
(char,) = self.options
|
|
839
957
|
# TODO: Maybe accept a string scalar in
|
|
840
958
|
# cudf::strings::pad to avoid DtoH transfer
|
|
841
|
-
|
|
959
|
+
# See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
|
|
960
|
+
width: int = width_col.obj.to_scalar(stream=df.stream).to_py() # type: ignore[no-redef]
|
|
842
961
|
return Column(
|
|
843
962
|
plc.strings.padding.pad(
|
|
844
|
-
column.obj,
|
|
963
|
+
column.obj,
|
|
964
|
+
width, # type: ignore[arg-type]
|
|
965
|
+
plc.strings.SideType.LEFT,
|
|
966
|
+
char,
|
|
967
|
+
stream=df.stream,
|
|
845
968
|
),
|
|
846
969
|
dtype=self.dtype,
|
|
847
970
|
)
|
|
@@ -854,19 +977,29 @@ class StringFunction(Expr):
|
|
|
854
977
|
(char,) = self.options
|
|
855
978
|
# TODO: Maybe accept a string scalar in
|
|
856
979
|
# cudf::strings::pad to avoid DtoH transfer
|
|
857
|
-
width = width_col.obj.to_scalar().to_py()
|
|
980
|
+
width: int = width_col.obj.to_scalar(stream=df.stream).to_py() # type: ignore[no-redef]
|
|
858
981
|
return Column(
|
|
859
982
|
plc.strings.padding.pad(
|
|
860
|
-
column.obj,
|
|
983
|
+
column.obj,
|
|
984
|
+
width, # type: ignore[arg-type]
|
|
985
|
+
plc.strings.SideType.RIGHT,
|
|
986
|
+
char,
|
|
987
|
+
stream=df.stream,
|
|
861
988
|
),
|
|
862
989
|
dtype=self.dtype,
|
|
863
990
|
)
|
|
864
991
|
elif self.name is StringFunction.Name.Reverse:
|
|
865
992
|
(column,) = columns
|
|
866
|
-
return Column(
|
|
993
|
+
return Column(
|
|
994
|
+
plc.strings.reverse.reverse(column.obj, stream=df.stream),
|
|
995
|
+
dtype=self.dtype,
|
|
996
|
+
)
|
|
867
997
|
elif self.name is StringFunction.Name.Titlecase:
|
|
868
998
|
(column,) = columns
|
|
869
|
-
return Column(
|
|
999
|
+
return Column(
|
|
1000
|
+
plc.strings.capitalize.title(column.obj, stream=df.stream),
|
|
1001
|
+
dtype=self.dtype,
|
|
1002
|
+
)
|
|
870
1003
|
raise NotImplementedError(
|
|
871
1004
|
f"StringFunction {self.name}"
|
|
872
1005
|
) # pragma: no cover; handled by init raising
|