cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff compares the contents of publicly released versions of this package as published to their public registry. It is provided for informational purposes only.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +60 -15
- cudf_polars/containers/column.py +137 -77
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +256 -114
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +33 -3
- cudf_polars/dsl/expressions/unary.py +126 -64
- cudf_polars/dsl/ir.py +1053 -350
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +307 -107
- cudf_polars/dsl/utils/aggregations.py +43 -30
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +55 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +792 -2
- cudf_polars/experimental/benchmarks/utils.py +596 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +30 -15
- cudf_polars/experimental/groupby.py +25 -4
- cudf_polars/experimental/io.py +156 -124
- cudf_polars/experimental/join.py +53 -23
- cudf_polars/experimental/parallel.py +68 -19
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
- cudf_polars/experimental/rapidsmpf/core.py +488 -0
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
- cudf_polars/experimental/rapidsmpf/io.py +696 -0
- cudf_polars/experimental/rapidsmpf/join.py +322 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
- cudf_polars/experimental/rapidsmpf/union.py +115 -0
- cudf_polars/experimental/rapidsmpf/utils.py +374 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +46 -12
- cudf_polars/experimental/sort.py +100 -26
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +93 -17
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +473 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +5 -4
- cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
- cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
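
Most of the churn in the `cudf_polars/dsl/expressions/string.py` hunks reproduced below follows two mechanical patterns rather than behavioral changes: a rename of the `DataType` accessors (`.plc` to `.plc_type`, `.polars` to `.polars_type`) and the threading of an explicit CUDA stream through every pylibcudf call, which the new `cudf_polars/utils/cuda_stream.py` helper and the `df.stream` attribute visible in the hunks appear to support. A minimal sketch of the stream-threading pattern, using a call taken from the hunks below (`query_stream` is a hypothetical stand-in for the stream owned by the evaluating `DataFrame`; this is not the package's actual helper code):

```python
import pylibcudf as plc

def lowercase(strings: plc.Column, query_stream) -> plc.Column:
    # 25.10.0 launched this kernel on the default CUDA stream implicitly:
    #     plc.strings.case.to_lower(strings)
    # 26.2.0 passes the query's stream to every pylibcudf call, keeping
    # all work for one DataFrame ordered on one explicit stream:
    return plc.strings.case.to_lower(strings, stream=query_stream)
```

The same `stream=df.stream` argument shows up on scalar construction (`plc.Scalar.from_py(..., stream=...)`) and on device-to-host transfers (`.to_py(stream=...)`) throughout the file.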
cudf_polars/dsl/expressions/string.py

@@ -10,10 +10,10 @@ import functools
 import re
 from datetime import datetime
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, cast

+from polars import Struct as pl_Struct, polars  # type: ignore[attr-defined]
 from polars.exceptions import InvalidOperationError
-from polars.polars import dtype_str_repr

 import pylibcudf as plc

@@ -26,8 +26,6 @@ from cudf_polars.utils.versions import POLARS_VERSION_LT_132
 if TYPE_CHECKING:
     from typing_extensions import Self

-    from polars.polars import _expr_nodes as pl_expr
-
     from cudf_polars.containers import DataFrame, DataType

 __all__ = ["StringFunction"]
@@ -37,10 +35,15 @@ JsonDecodeType = list[tuple[str, plc.DataType, "JsonDecodeType"]]

 def _dtypes_for_json_decode(dtype: DataType) -> JsonDecodeType:
     """Get the dtypes for json decode."""
+    # Type checker doesn't narrow polars_type through dtype.id() check
     if dtype.id() == plc.TypeId.STRUCT:
         return [
-            (field.name, child.plc, _dtypes_for_json_decode(child))
-            for field, child in zip(dtype.polars.fields, dtype.children, strict=True)
+            (field.name, child.plc_type, _dtypes_for_json_decode(child))
+            for field, child in zip(
+                cast(pl_Struct, dtype.polars_type).fields,
+                dtype.children,
+                strict=True,
+            )
         ]
     else:
         return []
@@ -96,7 +99,7 @@ class StringFunction(Expr):
         ZFill = auto()

         @classmethod
-        def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
+        def from_polars(cls, obj: polars._expr_nodes.StringFunction) -> Self:
             """Convert from polars' `StringFunction`."""
             try:
                 function, name = str(obj).split(".", maxsplit=1)
@@ -278,7 +281,7 @@ class StringFunction(Expr):
                 and width.value is not None
                 and width.value < 0
             ):  # pragma: no cover
-                dtypestr = dtype_str_repr(width.dtype.polars)
+                dtypestr = polars.dtype_str_repr(width.dtype.polars_type)
                 raise InvalidOperationError(
                     f"conversion from `{dtypestr}` to `u64` "
                     f"failed in column 'literal' for 1 out of "
@@ -310,14 +313,17 @@ class StringFunction(Expr):
            columns = [
                Column(
                    child.evaluate(df, context=context).obj, dtype=child.dtype
-               ).astype(self.dtype)
+               ).astype(self.dtype, stream=df.stream)
                for child in self.children
            ]
+           if len(columns) == 1:
+               return columns[0]

            non_unit_sizes = [c.size for c in columns if c.size != 1]
            broadcasted = broadcast(
                *columns,
                target_length=max(non_unit_sizes) if non_unit_sizes else None,
+               stream=df.stream,
            )

            delimiter, ignore_nulls = self.options
@@ -325,24 +331,39 @@ class StringFunction(Expr):
            return Column(
                plc.strings.combine.concatenate(
                    plc.Table([col.obj for col in broadcasted]),
-                   plc.Scalar.from_py(delimiter, self.dtype.plc),
-                   None if ignore_nulls else plc.Scalar.from_py(None, self.dtype.plc),
+                   plc.Scalar.from_py(
+                       delimiter, self.dtype.plc_type, stream=df.stream
+                   ),
+                   None
+                   if ignore_nulls
+                   else plc.Scalar.from_py(
+                       None, self.dtype.plc_type, stream=df.stream
+                   ),
                    None,
                    plc.strings.combine.SeparatorOnNulls.NO,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.ConcatVertical:
            (child,) = self.children
-           column = child.evaluate(df, context=context).astype(self.dtype)
+           column = child.evaluate(df, context=context).astype(
+               self.dtype, stream=df.stream
+           )
            delimiter, ignore_nulls = self.options
            if column.null_count > 0 and not ignore_nulls:
-               return Column(plc.Column.all_null_like(column.obj, 1), dtype=self.dtype)
+               return Column(
+                   plc.Column.all_null_like(column.obj, 1, stream=df.stream),
+                   dtype=self.dtype,
+               )
            return Column(
                plc.strings.combine.join_strings(
                    column.obj,
-                   plc.Scalar.from_py(delimiter, self.dtype.plc),
-                   plc.Scalar.from_py(None, self.dtype.plc),
+                   plc.Scalar.from_py(
+                       delimiter, self.dtype.plc_type, stream=df.stream
+                   ),
+                   plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
@@ -351,19 +372,25 @@ class StringFunction(Expr):
            # polars pads based on bytes, libcudf by visual width
            # only pass chars if the visual width matches the byte length
            column = self.children[0].evaluate(df, context=context)
-           col_len_bytes = plc.strings.attributes.count_bytes(column.obj)
-           col_len_chars = plc.strings.attributes.count_characters(column.obj)
+           col_len_bytes = plc.strings.attributes.count_bytes(
+               column.obj, stream=df.stream
+           )
+           col_len_chars = plc.strings.attributes.count_characters(
+               column.obj, stream=df.stream
+           )
            equal = plc.binaryop.binary_operation(
                col_len_bytes,
                col_len_chars,
                plc.binaryop.BinaryOperator.NULL_EQUALS,
                plc.DataType(plc.TypeId.BOOL8),
+               stream=df.stream,
            )
            if not plc.reduce.reduce(
                equal,
                plc.aggregation.all(),
                plc.DataType(plc.TypeId.BOOL8),
-           ).to_py():
+               stream=df.stream,
+           ).to_py(stream=df.stream):
                raise InvalidOperationError(
                    "zfill only supports ascii strings with no unicode characters"
                )
@@ -373,36 +400,45 @@ class StringFunction(Expr):
                if width.value is None:
                    return Column(
                        plc.Column.from_scalar(
-                           plc.Scalar.from_py(None, self.dtype.plc),
+                           plc.Scalar.from_py(
+                               None, self.dtype.plc_type, stream=df.stream
+                           ),
                            column.size,
+                           stream=df.stream,
                        ),
                        self.dtype,
                    )
                return Column(
-                   plc.strings.padding.zfill(column.obj, width.value), self.dtype
+                   plc.strings.padding.zfill(
+                       column.obj, width.value, stream=df.stream
+                   ),
+                   self.dtype,
                )
            else:
                col_width = self.children[1].evaluate(df, context=context)
                assert isinstance(col_width, Column)
                all_gt_0 = plc.binaryop.binary_operation(
                    col_width.obj,
-                   plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT64)),
+                   plc.Scalar.from_py(
+                       0, plc.DataType(plc.TypeId.INT64), stream=df.stream
+                   ),
                    plc.binaryop.BinaryOperator.GREATER_EQUAL,
                    plc.DataType(plc.TypeId.BOOL8),
+                   stream=df.stream,
                )

-               if (
-                   POLARS_VERSION_LT_132
-                   and not plc.reduce.reduce(
-                       all_gt_0,
-                       plc.aggregation.all(),
-                       plc.DataType(plc.TypeId.BOOL8),
-                   ).to_py()
-               ):  # pragma: no cover
+               if POLARS_VERSION_LT_132 and not plc.reduce.reduce(
+                   all_gt_0,
+                   plc.aggregation.all(),
+                   plc.DataType(plc.TypeId.BOOL8),
+                   stream=df.stream,
+               ).to_py(stream=df.stream):  # pragma: no cover
                    raise InvalidOperationError("fill conversion failed.")

                return Column(
-                   plc.strings.padding.zfill_by_widths(column.obj, col_width.obj),
+                   plc.strings.padding.zfill_by_widths(
+                       column.obj, col_width.obj, stream=df.stream
+                   ),
                    self.dtype,
                )

@@ -414,34 +450,39 @@ class StringFunction(Expr):
            if literal:
                pat = arg.evaluate(df, context=context)
                pattern = (
-                   pat.obj_scalar
+                   pat.obj_scalar(stream=df.stream)
                    if pat.is_scalar and pat.size != column.size
                    else pat.obj
                )
                return Column(
-                   plc.strings.find.contains(column.obj, pattern), dtype=self.dtype
+                   plc.strings.find.contains(column.obj, pattern, stream=df.stream),
+                   dtype=self.dtype,
                )
            else:
                return Column(
-                   plc.strings.contains.contains_re(column.obj, self._regex_program),
+                   plc.strings.contains.contains_re(
+                       column.obj, self._regex_program, stream=df.stream
+                   ),
                    dtype=self.dtype,
                )
        elif self.name is StringFunction.Name.ContainsAny:
            (ascii_case_insensitive,) = self.options
            child, arg = self.children
-
-
+           plc_column = child.evaluate(df, context=context).obj
+           plc_targets = arg.evaluate(df, context=context).obj
            if ascii_case_insensitive:
-
-
+               plc_column = plc.strings.case.to_lower(plc_column, stream=df.stream)
+               plc_targets = plc.strings.case.to_lower(plc_targets, stream=df.stream)
            contains = plc.strings.find_multiple.contains_multiple(
-
-
+               plc_column,
+               plc_targets,
+               stream=df.stream,
            )
            binary_or = functools.partial(
                plc.binaryop.binary_operation,
                op=plc.binaryop.BinaryOperator.BITWISE_OR,
-               output_type=self.dtype.plc,
+               output_type=self.dtype.plc_type,
+               stream=df.stream,
            )
            return Column(
                functools.reduce(binary_or, contains.columns()),
@@ -449,28 +490,30 @@ class StringFunction(Expr):
            )
        elif self.name is StringFunction.Name.CountMatches:
            (child, _) = self.children
-
+           plc_column = child.evaluate(df, context=context).obj
            return Column(
                plc.unary.cast(
-                   plc.strings.contains.count_re(
-
+                   plc.strings.contains.count_re(
+                       plc_column, self._regex_program, stream=df.stream
+                   ),
+                   self.dtype.plc_type,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.Extract:
            (group_index,) = self.options
-
+           plc_column = self.children[0].evaluate(df, context=context).obj
            return Column(
                plc.strings.extract.extract_single(
-
+                   plc_column, self._regex_program, group_index - 1, stream=df.stream
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.ExtractGroups:
-
+           plc_column = self.children[0].evaluate(df, context=context).obj
            plc_table = plc.strings.extract.extract(
-
-               self._regex_program,
+               plc_column, self._regex_program, stream=df.stream
            )
            return Column(
                plc.Column.struct_from_children(plc_table.columns()),
@@ -479,38 +522,45 @@ class StringFunction(Expr):
        elif self.name is StringFunction.Name.Find:
            literal, _ = self.options
            (child, expr) = self.children
-
+           plc_column = child.evaluate(df, context=context).obj
            if literal:
                assert isinstance(expr, Literal)
                plc_column = plc.strings.find.find(
-
-                   plc.Scalar.from_py(
+                   plc_column,
+                   plc.Scalar.from_py(
+                       expr.value, expr.dtype.plc_type, stream=df.stream
+                   ),
+                   stream=df.stream,
                )
            else:
                plc_column = plc.strings.findall.find_re(
-
-                   self._regex_program,
+                   plc_column, self._regex_program, stream=df.stream
                )
            # Polars returns None for not found, libcudf returns -1
            new_mask, null_count = plc.transform.bools_to_mask(
                plc.binaryop.binary_operation(
                    plc_column,
-                   plc.Scalar.from_py(-1, plc_column.type()),
+                   plc.Scalar.from_py(-1, plc_column.type(), stream=df.stream),
                    plc.binaryop.BinaryOperator.NOT_EQUAL,
                    plc.DataType(plc.TypeId.BOOL8),
-               ),
+                   stream=df.stream,
+               ),
+               stream=df.stream,
            )
            plc_column = plc.unary.cast(
-               plc_column.with_mask(new_mask, null_count), self.dtype.plc
+               plc_column.with_mask(new_mask, null_count),
+               self.dtype.plc_type,
+               stream=df.stream,
            )
            return Column(plc_column, dtype=self.dtype)
        elif self.name is StringFunction.Name.JsonDecode:
            plc_column = self.children[0].evaluate(df, context=context).obj
            plc_table_with_metadata = plc.io.json.read_json_from_string_column(
                plc_column,
-               plc.Scalar.from_py("\n"),
-               plc.Scalar.from_py("NULL"),
+               plc.Scalar.from_py("\n", stream=df.stream),
+               plc.Scalar.from_py("NULL", stream=df.stream),
                _dtypes_for_json_decode(self.dtype),
+               stream=df.stream,
            )
            return Column(
                plc.Column.struct_from_children(plc_table_with_metadata.columns),
@@ -518,26 +568,34 @@ class StringFunction(Expr):
            )
        elif self.name is StringFunction.Name.JsonPathMatch:
            (child, expr) = self.children
-
+           plc_column = child.evaluate(df, context=context).obj
            assert isinstance(expr, Literal)
-           json_path = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+           json_path = plc.Scalar.from_py(
+               expr.value, expr.dtype.plc_type, stream=df.stream
+           )
            return Column(
-               plc.json.get_json_object(
+               plc.json.get_json_object(plc_column, json_path, stream=df.stream),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.LenBytes:
-
+           plc_column = self.children[0].evaluate(df, context=context).obj
            return Column(
                plc.unary.cast(
-                   plc.strings.attributes.count_bytes(
+                   plc.strings.attributes.count_bytes(plc_column, stream=df.stream),
+                   self.dtype.plc_type,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.LenChars:
-
+           plc_column = self.children[0].evaluate(df, context=context).obj
            return Column(
                plc.unary.cast(
-                   plc.strings.attributes.count_characters(
+                   plc.strings.attributes.count_characters(
+                       plc_column, stream=df.stream
+                   ),
+                   self.dtype.plc_type,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
@@ -567,8 +625,13 @@ class StringFunction(Expr):
            return Column(
                plc.strings.slice.slice_strings(
                    column.obj,
-                   plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
-                   plc.Scalar.from_py(stop, plc.DataType(plc.TypeId.INT32)),
+                   plc.Scalar.from_py(
+                       start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                   ),
+                   plc.Scalar.from_py(
+                       stop, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                   ),
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
@@ -582,7 +645,7 @@ class StringFunction(Expr):
            column = child.evaluate(df, context=context)
            if n == 1 and self.name is StringFunction.Name.SplitN:
                plc_column = plc.Column(
-                   self.dtype.plc,
+                   self.dtype.plc_type,
                    column.obj.size(),
                    None,
                    None,
@@ -592,7 +655,9 @@ class StringFunction(Expr):
                )
            else:
                assert isinstance(expr, Literal)
-               by = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+               by = plc.Scalar.from_py(
+                   expr.value, expr.dtype.plc_type, stream=df.stream
+               )
                # See https://github.com/pola-rs/polars/issues/11640
                # for SplitN vs SplitExact edge case behaviors
                max_splits = n if is_split_n else 0
@@ -600,13 +665,16 @@ class StringFunction(Expr):
                column.obj,
                by,
                max_splits - 1,
+               stream=df.stream,
            )
            children = plc_table.columns()
            ref_column = children[0]
            if (remainder := n - len(children)) > 0:
                # Reach expected number of splits by padding with nulls
                children.extend(
-                   plc.Column.all_null_like(ref_column, ref_column.size())
+                   plc.Column.all_null_like(
+                       ref_column, ref_column.size(), stream=df.stream
+                   )
                    for _ in range(remainder + int(not is_split_n))
                )
            if not is_split_n:
@@ -614,7 +682,7 @@ class StringFunction(Expr):
                # TODO: Use plc.Column.struct_from_children once it is generalized
                # to handle columns that don't share the same null_mask/null_count
                plc_column = plc.Column(
-                   self.dtype.plc,
+                   self.dtype.plc_type,
                    ref_column.size(),
                    None,
                    None,
@@ -628,9 +696,11 @@ class StringFunction(Expr):
            StringFunction.Name.StripSuffix,
        }:
            child, expr = self.children
-
+           plc_column = child.evaluate(df, context=context).obj
            assert isinstance(expr, Literal)
-           target = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+           target = plc.Scalar.from_py(
+               expr.value, expr.dtype.plc_type, stream=df.stream
+           )
            if self.name == StringFunction.Name.StripPrefix:
                find = plc.strings.find.starts_with
                start = len(expr.value)
@@ -640,17 +710,23 @@ class StringFunction(Expr):
                start = 0
                end = -len(expr.value)

-           mask = find(
+           mask = find(plc_column, target, stream=df.stream)
            sliced = plc.strings.slice.slice_strings(
-
-               plc.Scalar.from_py(
-
+               plc_column,
+               plc.Scalar.from_py(
+                   start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+               ),
+               plc.Scalar.from_py(
+                   end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+               ),
+               stream=df.stream,
            )
            return Column(
                plc.copying.copy_if_else(
                    sliced,
-
+                   plc_column,
                    mask,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
@@ -667,7 +743,12 @@ class StringFunction(Expr):
            else:
                side = plc.strings.SideType.BOTH
            return Column(
-               plc.strings.strip.strip(column.obj, side, chars.obj_scalar),
+               plc.strings.strip.strip(
+                   column.obj,
+                   side,
+                   chars.obj_scalar(stream=df.stream),
+                   stream=df.stream,
+               ),
                dtype=self.dtype,
            )

@@ -678,15 +759,17 @@ class StringFunction(Expr):
            if self.children[1].value is None:
                return Column(
                    plc.Column.from_scalar(
-                       plc.Scalar.from_py(None, self.dtype.plc),
+                       plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
                        column.size,
+                       stream=df.stream,
                    ),
                    self.dtype,
                )
            elif self.children[1].value == 0:
                result = plc.Column.from_scalar(
-                   plc.Scalar.from_py("", self.dtype.plc),
+                   plc.Scalar.from_py("", self.dtype.plc_type, stream=df.stream),
                    column.size,
+                   stream=df.stream,
                )
                if column.obj.null_mask():
                    result = result.with_mask(
@@ -700,9 +783,14 @@ class StringFunction(Expr):
            return Column(
                plc.strings.slice.slice_strings(
                    column.obj,
-                   plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
-                   plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+                   plc.Scalar.from_py(
+                       start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                   ),
+                   plc.Scalar.from_py(
+                       end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                   ),
                    None,
+                   stream=df.stream,
                ),
                self.dtype,
            )
@@ -715,16 +803,22 @@ class StringFunction(Expr):
            if end is None:
                return Column(
                    plc.Column.from_scalar(
-                       plc.Scalar.from_py(None, self.dtype.plc),
+                       plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
                        column.size,
+                       stream=df.stream,
                    ),
                    self.dtype,
                )
            return Column(
                plc.strings.slice.slice_strings(
                    column.obj,
-                   plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT32)),
-                   plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+                   plc.Scalar.from_py(
+                       0, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                   ),
+                   plc.Scalar.from_py(
+                       end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                   ),
+                   stream=df.stream,
                ),
                self.dtype,
            )
@@ -732,18 +826,25 @@ class StringFunction(Expr):
        columns = [child.evaluate(df, context=context) for child in self.children]
        if self.name is StringFunction.Name.Lowercase:
            (column,) = columns
-           return Column(plc.strings.case.to_lower(column.obj), dtype=self.dtype)
+           return Column(
+               plc.strings.case.to_lower(column.obj, stream=df.stream),
+               dtype=self.dtype,
+           )
        elif self.name is StringFunction.Name.Uppercase:
            (column,) = columns
-           return Column(plc.strings.case.to_upper(column.obj), dtype=self.dtype)
+           return Column(
+               plc.strings.case.to_upper(column.obj, stream=df.stream),
+               dtype=self.dtype,
+           )
        elif self.name is StringFunction.Name.EndsWith:
            column, suffix = columns
            return Column(
                plc.strings.find.ends_with(
                    column.obj,
-                   suffix.obj_scalar
+                   suffix.obj_scalar(stream=df.stream)
                    if column.size != suffix.size and suffix.is_scalar
                    else suffix.obj,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
@@ -752,9 +853,10 @@ class StringFunction(Expr):
            return Column(
                plc.strings.find.starts_with(
                    column.obj,
-                   prefix.obj_scalar
+                   prefix.obj_scalar(stream=df.stream)
                    if column.size != prefix.size and prefix.is_scalar
                    else prefix.obj,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
@@ -766,107 +868,147 @@ class StringFunction(Expr):
            if plc_col.null_count() == plc_col.size():
                return Column(
                    plc.Column.from_scalar(
-                       plc.Scalar.from_py(None, self.dtype.plc),
+                       plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
                        plc_col.size(),
+                       stream=df.stream,
                    ),
                    self.dtype,
                )
            if format is None:
                # Polars begins inference with the first non null value
                if plc_col.null_mask() is not None:
-                   boolmask = plc.unary.is_valid(plc_col)
+                   boolmask = plc.unary.is_valid(plc_col, stream=df.stream)
                    table = plc.stream_compaction.apply_boolean_mask(
-                       plc.Table([plc_col]), boolmask
+                       plc.Table([plc_col]), boolmask, stream=df.stream
                    )
                    filtered = table.columns()[0]
-                   first_valid_data = plc.copying.get_element(filtered, 0).to_py()
+                   first_valid_data = plc.copying.get_element(
+                       filtered, 0, stream=df.stream
+                   ).to_py(stream=df.stream)
                else:
-                   first_valid_data = plc.copying.get_element(plc_col, 0).to_py()
+                   first_valid_data = plc.copying.get_element(
+                       plc_col, 0, stream=df.stream
+                   ).to_py(stream=df.stream)

-               format = _infer_datetime_format(first_valid_data)
+               # See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
+               format = _infer_datetime_format(first_valid_data)  # type: ignore[arg-type]
                if not format:
                    raise InvalidOperationError(
                        "Unable to infer datetime format from data"
                    )

            is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
-               plc_col, format
+               plc_col, format, stream=df.stream
            )
            if strict:
                if not plc.reduce.reduce(
                    is_timestamps,
                    plc.aggregation.all(),
                    plc.DataType(plc.TypeId.BOOL8),
-               ).to_py():
+                   stream=df.stream,
+               ).to_py(stream=df.stream):
                    raise InvalidOperationError("conversion from `str` failed.")
            else:
                not_timestamps = plc.unary.unary_operation(
-                   is_timestamps, plc.unary.UnaryOperator.NOT
+                   is_timestamps, plc.unary.UnaryOperator.NOT, stream=df.stream
                )
-               null = plc.Scalar.from_py(None, plc_col.type())
+               null = plc.Scalar.from_py(None, plc_col.type(), stream=df.stream)
                plc_col = plc.copying.boolean_mask_scatter(
-                   [null], plc.Table([plc_col]), not_timestamps
+                   [null], plc.Table([plc_col]), not_timestamps, stream=df.stream
                ).columns()[0]

            return Column(
                plc.strings.convert.convert_datetime.to_timestamps(
-                   plc_col, self.dtype.plc, format
+                   plc_col, self.dtype.plc_type, format, stream=df.stream
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.Replace:
-
+           col_column, col_target, col_repl = columns
            n, _ = self.options
            return Column(
                plc.strings.replace.replace(
-
+                   col_column.obj,
+                   col_target.obj_scalar(stream=df.stream),
+                   col_repl.obj_scalar(stream=df.stream),
+                   maxrepl=n,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.ReplaceMany:
-
+           col_column, col_target, col_repl = columns
            return Column(
-               plc.strings.replace.replace_multiple(
+               plc.strings.replace.replace_multiple(
+                   col_column.obj, col_target.obj, col_repl.obj, stream=df.stream
+               ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.PadStart:
            if POLARS_VERSION_LT_132:  # pragma: no cover
                (column,) = columns
-
+               width_arg, char = self.options
+               pad_width = cast(int, width_arg)
            else:
                (column, width_col) = columns
                (char,) = self.options
                # TODO: Maybe accept a string scalar in
                # cudf::strings::pad to avoid DtoH transfer
-
+               # See https://github.com/rapidsai/cudf/issues/20202
+               width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+                   stream=df.stream
+               )
+               assert width_py is not None
+               pad_width = int(width_py)
+
            return Column(
                plc.strings.padding.pad(
-                   column.obj,
+                   column.obj,
+                   pad_width,
+                   plc.strings.SideType.LEFT,
+                   char,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.PadEnd:
            if POLARS_VERSION_LT_132:  # pragma: no cover
                (column,) = columns
-
+               width_arg, char = self.options
+               pad_width = cast(int, width_arg)
            else:
                (column, width_col) = columns
                (char,) = self.options
                # TODO: Maybe accept a string scalar in
                # cudf::strings::pad to avoid DtoH transfer
-
+               width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+                   stream=df.stream
+               )
+               assert width_py is not None
+               pad_width = int(width_py)
+
            return Column(
                plc.strings.padding.pad(
-                   column.obj,
+                   column.obj,
+                   pad_width,
+                   plc.strings.SideType.RIGHT,
+                   char,
+                   stream=df.stream,
                ),
                dtype=self.dtype,
            )
        elif self.name is StringFunction.Name.Reverse:
            (column,) = columns
-           return Column(plc.strings.reverse.reverse(column.obj), dtype=self.dtype)
+           return Column(
+               plc.strings.reverse.reverse(column.obj, stream=df.stream),
+               dtype=self.dtype,
+           )
        elif self.name is StringFunction.Name.Titlecase:
            (column,) = columns
-           return Column(plc.strings.capitalize.title(column.obj), dtype=self.dtype)
+           return Column(
+               plc.strings.capitalize.title(column.obj, stream=df.stream),
+               dtype=self.dtype,
+           )
        raise NotImplementedError(
            f"StringFunction {self.name}"
        )  # pragma: no cover; handled by init raising
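
The other recurring edit in these hunks is the container-API rename: `DataType.plc` becomes `DataType.plc_type`, `DataType.polars` becomes `DataType.polars_type`, and `Column.obj_scalar` changes from a property to a method taking `stream=`. A hedged compatibility sketch for code that touches these internals (the helper name is invented for illustration; only the two attribute names come from the hunks above):

```python
def plc_type_of(dtype):
    # cudf-polars 26.2.0 exposes the pylibcudf type as `.plc_type`;
    # 25.10.0 named the same attribute `.plc`.
    try:
        return dtype.plc_type
    except AttributeError:
        return dtype.plc
```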
|