cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
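The hunks below are from cudf_polars/dsl/expressions/string.py. Nearly every change applies the same migration pattern seen across this release: each pylibcudf call now receives an explicit stream=df.stream argument, the DataType accessors are renamed (.plc becomes .plc_type, .polars becomes .polars_type), and Column.obj_scalar changes from a property to a method that takes a stream. A minimal before/after sketch of the stream-threading pattern, distilled from the hunks below (the function names and parameters are illustrative stand-ins, not names from the package):

    import pylibcudf as plc

    # 25.10.0 shape: the call runs on pylibcudf's default CUDA stream.
    def lowercase_old(column):
        return plc.strings.case.to_lower(column.obj)

    # 26.2.0 shape: the evaluating DataFrame's stream is threaded
    # explicitly through the pylibcudf call.
    def lowercase_new(column, df):
        return plc.strings.case.to_lower(column.obj, stream=df.stream)

The same threading applies to scalar construction: plc.Scalar.from_py(value, dtype.plc_type, stream=df.stream) in place of plc.Scalar.from_py(value, dtype.plc).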
@@ -10,10 +10,10 @@ import functools
  import re
  from datetime import datetime
  from enum import IntEnum, auto
- from typing import TYPE_CHECKING, Any, ClassVar
+ from typing import TYPE_CHECKING, Any, ClassVar, cast

+ from polars import Struct as pl_Struct, polars # type: ignore[attr-defined]
  from polars.exceptions import InvalidOperationError
- from polars.polars import dtype_str_repr

  import pylibcudf as plc

@@ -26,8 +26,6 @@ from cudf_polars.utils.versions import POLARS_VERSION_LT_132
  if TYPE_CHECKING:
  from typing_extensions import Self

- from polars.polars import _expr_nodes as pl_expr
-
  from cudf_polars.containers import DataFrame, DataType

  __all__ = ["StringFunction"]
@@ -37,10 +35,15 @@ JsonDecodeType = list[tuple[str, plc.DataType, "JsonDecodeType"]]

  def _dtypes_for_json_decode(dtype: DataType) -> JsonDecodeType:
  """Get the dtypes for json decode."""
+ # Type checker doesn't narrow polars_type through dtype.id() check
  if dtype.id() == plc.TypeId.STRUCT:
  return [
- (field.name, child.plc, _dtypes_for_json_decode(child))
- for field, child in zip(dtype.polars.fields, dtype.children, strict=True)
+ (field.name, child.plc_type, _dtypes_for_json_decode(child))
+ for field, child in zip(
+ cast(pl_Struct, dtype.polars_type).fields,
+ dtype.children,
+ strict=True,
+ )
  ]
  else:
  return []
@@ -96,7 +99,7 @@ class StringFunction(Expr):
  ZFill = auto()

  @classmethod
- def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
+ def from_polars(cls, obj: polars._expr_nodes.StringFunction) -> Self:
  """Convert from polars' `StringFunction`."""
  try:
  function, name = str(obj).split(".", maxsplit=1)
@@ -278,7 +281,7 @@ class StringFunction(Expr):
  and width.value is not None
  and width.value < 0
  ): # pragma: no cover
- dtypestr = dtype_str_repr(width.dtype.polars)
+ dtypestr = polars.dtype_str_repr(width.dtype.polars_type)
  raise InvalidOperationError(
  f"conversion from `{dtypestr}` to `u64` "
  f"failed in column 'literal' for 1 out of "
@@ -310,14 +313,17 @@ class StringFunction(Expr):
  columns = [
  Column(
  child.evaluate(df, context=context).obj, dtype=child.dtype
- ).astype(self.dtype)
+ ).astype(self.dtype, stream=df.stream)
  for child in self.children
  ]
+ if len(columns) == 1:
+ return columns[0]

  non_unit_sizes = [c.size for c in columns if c.size != 1]
  broadcasted = broadcast(
  *columns,
  target_length=max(non_unit_sizes) if non_unit_sizes else None,
+ stream=df.stream,
  )

  delimiter, ignore_nulls = self.options
@@ -325,24 +331,39 @@ class StringFunction(Expr):
  return Column(
  plc.strings.combine.concatenate(
  plc.Table([col.obj for col in broadcasted]),
- plc.Scalar.from_py(delimiter, self.dtype.plc),
- None if ignore_nulls else plc.Scalar.from_py(None, self.dtype.plc),
+ plc.Scalar.from_py(
+ delimiter, self.dtype.plc_type, stream=df.stream
+ ),
+ None
+ if ignore_nulls
+ else plc.Scalar.from_py(
+ None, self.dtype.plc_type, stream=df.stream
+ ),
  None,
  plc.strings.combine.SeparatorOnNulls.NO,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.ConcatVertical:
  (child,) = self.children
- column = child.evaluate(df, context=context).astype(self.dtype)
+ column = child.evaluate(df, context=context).astype(
+ self.dtype, stream=df.stream
+ )
  delimiter, ignore_nulls = self.options
  if column.null_count > 0 and not ignore_nulls:
- return Column(plc.Column.all_null_like(column.obj, 1), dtype=self.dtype)
+ return Column(
+ plc.Column.all_null_like(column.obj, 1, stream=df.stream),
+ dtype=self.dtype,
+ )
  return Column(
  plc.strings.combine.join_strings(
  column.obj,
- plc.Scalar.from_py(delimiter, self.dtype.plc),
- plc.Scalar.from_py(None, self.dtype.plc),
+ plc.Scalar.from_py(
+ delimiter, self.dtype.plc_type, stream=df.stream
+ ),
+ plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
@@ -351,19 +372,25 @@ class StringFunction(Expr):
  # polars pads based on bytes, libcudf by visual width
  # only pass chars if the visual width matches the byte length
  column = self.children[0].evaluate(df, context=context)
- col_len_bytes = plc.strings.attributes.count_bytes(column.obj)
- col_len_chars = plc.strings.attributes.count_characters(column.obj)
+ col_len_bytes = plc.strings.attributes.count_bytes(
+ column.obj, stream=df.stream
+ )
+ col_len_chars = plc.strings.attributes.count_characters(
+ column.obj, stream=df.stream
+ )
  equal = plc.binaryop.binary_operation(
  col_len_bytes,
  col_len_chars,
  plc.binaryop.BinaryOperator.NULL_EQUALS,
  plc.DataType(plc.TypeId.BOOL8),
+ stream=df.stream,
  )
  if not plc.reduce.reduce(
  equal,
  plc.aggregation.all(),
  plc.DataType(plc.TypeId.BOOL8),
- ).to_py():
+ stream=df.stream,
+ ).to_py(stream=df.stream):
  raise InvalidOperationError(
  "zfill only supports ascii strings with no unicode characters"
  )
@@ -373,36 +400,45 @@ class StringFunction(Expr):
  if width.value is None:
  return Column(
  plc.Column.from_scalar(
- plc.Scalar.from_py(None, self.dtype.plc),
+ plc.Scalar.from_py(
+ None, self.dtype.plc_type, stream=df.stream
+ ),
  column.size,
+ stream=df.stream,
  ),
  self.dtype,
  )
  return Column(
- plc.strings.padding.zfill(column.obj, width.value), self.dtype
+ plc.strings.padding.zfill(
+ column.obj, width.value, stream=df.stream
+ ),
+ self.dtype,
  )
  else:
  col_width = self.children[1].evaluate(df, context=context)
  assert isinstance(col_width, Column)
  all_gt_0 = plc.binaryop.binary_operation(
  col_width.obj,
- plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT64)),
+ plc.Scalar.from_py(
+ 0, plc.DataType(plc.TypeId.INT64), stream=df.stream
+ ),
  plc.binaryop.BinaryOperator.GREATER_EQUAL,
  plc.DataType(plc.TypeId.BOOL8),
+ stream=df.stream,
  )

- if (
- POLARS_VERSION_LT_132
- and not plc.reduce.reduce(
- all_gt_0,
- plc.aggregation.all(),
- plc.DataType(plc.TypeId.BOOL8),
- ).to_py()
- ): # pragma: no cover
+ if POLARS_VERSION_LT_132 and not plc.reduce.reduce(
+ all_gt_0,
+ plc.aggregation.all(),
+ plc.DataType(plc.TypeId.BOOL8),
+ stream=df.stream,
+ ).to_py(stream=df.stream): # pragma: no cover
  raise InvalidOperationError("fill conversion failed.")

  return Column(
- plc.strings.padding.zfill_by_widths(column.obj, col_width.obj),
+ plc.strings.padding.zfill_by_widths(
+ column.obj, col_width.obj, stream=df.stream
+ ),
  self.dtype,
  )

@@ -414,34 +450,39 @@ class StringFunction(Expr):
  if literal:
  pat = arg.evaluate(df, context=context)
  pattern = (
- pat.obj_scalar
+ pat.obj_scalar(stream=df.stream)
  if pat.is_scalar and pat.size != column.size
  else pat.obj
  )
  return Column(
- plc.strings.find.contains(column.obj, pattern), dtype=self.dtype
+ plc.strings.find.contains(column.obj, pattern, stream=df.stream),
+ dtype=self.dtype,
  )
  else:
  return Column(
- plc.strings.contains.contains_re(column.obj, self._regex_program),
+ plc.strings.contains.contains_re(
+ column.obj, self._regex_program, stream=df.stream
+ ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.ContainsAny:
  (ascii_case_insensitive,) = self.options
  child, arg = self.children
- column = child.evaluate(df, context=context).obj
- targets = arg.evaluate(df, context=context).obj
+ plc_column = child.evaluate(df, context=context).obj
+ plc_targets = arg.evaluate(df, context=context).obj
  if ascii_case_insensitive:
- column = plc.strings.case.to_lower(column)
- targets = plc.strings.case.to_lower(targets)
+ plc_column = plc.strings.case.to_lower(plc_column, stream=df.stream)
+ plc_targets = plc.strings.case.to_lower(plc_targets, stream=df.stream)
  contains = plc.strings.find_multiple.contains_multiple(
- column,
- targets,
+ plc_column,
+ plc_targets,
+ stream=df.stream,
  )
  binary_or = functools.partial(
  plc.binaryop.binary_operation,
  op=plc.binaryop.BinaryOperator.BITWISE_OR,
- output_type=self.dtype.plc,
+ output_type=self.dtype.plc_type,
+ stream=df.stream,
  )
  return Column(
  functools.reduce(binary_or, contains.columns()),
@@ -449,28 +490,30 @@ class StringFunction(Expr):
  )
  elif self.name is StringFunction.Name.CountMatches:
  (child, _) = self.children
- column = child.evaluate(df, context=context).obj
+ plc_column = child.evaluate(df, context=context).obj
  return Column(
  plc.unary.cast(
- plc.strings.contains.count_re(column, self._regex_program),
- self.dtype.plc,
+ plc.strings.contains.count_re(
+ plc_column, self._regex_program, stream=df.stream
+ ),
+ self.dtype.plc_type,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.Extract:
  (group_index,) = self.options
- column = self.children[0].evaluate(df, context=context).obj
+ plc_column = self.children[0].evaluate(df, context=context).obj
  return Column(
  plc.strings.extract.extract_single(
- column, self._regex_program, group_index - 1
+ plc_column, self._regex_program, group_index - 1, stream=df.stream
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.ExtractGroups:
- column = self.children[0].evaluate(df, context=context).obj
+ plc_column = self.children[0].evaluate(df, context=context).obj
  plc_table = plc.strings.extract.extract(
- column,
- self._regex_program,
+ plc_column, self._regex_program, stream=df.stream
  )
  return Column(
  plc.Column.struct_from_children(plc_table.columns()),
@@ -479,38 +522,45 @@ class StringFunction(Expr):
  elif self.name is StringFunction.Name.Find:
  literal, _ = self.options
  (child, expr) = self.children
- column = child.evaluate(df, context=context).obj
+ plc_column = child.evaluate(df, context=context).obj
  if literal:
  assert isinstance(expr, Literal)
  plc_column = plc.strings.find.find(
- column,
- plc.Scalar.from_py(expr.value, expr.dtype.plc),
+ plc_column,
+ plc.Scalar.from_py(
+ expr.value, expr.dtype.plc_type, stream=df.stream
+ ),
+ stream=df.stream,
  )
  else:
  plc_column = plc.strings.findall.find_re(
- column,
- self._regex_program,
+ plc_column, self._regex_program, stream=df.stream
  )
  # Polars returns None for not found, libcudf returns -1
  new_mask, null_count = plc.transform.bools_to_mask(
  plc.binaryop.binary_operation(
  plc_column,
- plc.Scalar.from_py(-1, plc_column.type()),
+ plc.Scalar.from_py(-1, plc_column.type(), stream=df.stream),
  plc.binaryop.BinaryOperator.NOT_EQUAL,
  plc.DataType(plc.TypeId.BOOL8),
- )
+ stream=df.stream,
+ ),
+ stream=df.stream,
  )
  plc_column = plc.unary.cast(
- plc_column.with_mask(new_mask, null_count), self.dtype.plc
+ plc_column.with_mask(new_mask, null_count),
+ self.dtype.plc_type,
+ stream=df.stream,
  )
  return Column(plc_column, dtype=self.dtype)
  elif self.name is StringFunction.Name.JsonDecode:
  plc_column = self.children[0].evaluate(df, context=context).obj
  plc_table_with_metadata = plc.io.json.read_json_from_string_column(
  plc_column,
- plc.Scalar.from_py("\n"),
- plc.Scalar.from_py("NULL"),
+ plc.Scalar.from_py("\n", stream=df.stream),
+ plc.Scalar.from_py("NULL", stream=df.stream),
  _dtypes_for_json_decode(self.dtype),
+ stream=df.stream,
  )
  return Column(
  plc.Column.struct_from_children(plc_table_with_metadata.columns),
@@ -518,26 +568,34 @@ class StringFunction(Expr):
  )
  elif self.name is StringFunction.Name.JsonPathMatch:
  (child, expr) = self.children
- column = child.evaluate(df, context=context).obj
+ plc_column = child.evaluate(df, context=context).obj
  assert isinstance(expr, Literal)
- json_path = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+ json_path = plc.Scalar.from_py(
+ expr.value, expr.dtype.plc_type, stream=df.stream
+ )
  return Column(
- plc.json.get_json_object(column, json_path),
+ plc.json.get_json_object(plc_column, json_path, stream=df.stream),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.LenBytes:
- column = self.children[0].evaluate(df, context=context).obj
+ plc_column = self.children[0].evaluate(df, context=context).obj
  return Column(
  plc.unary.cast(
- plc.strings.attributes.count_bytes(column), self.dtype.plc
+ plc.strings.attributes.count_bytes(plc_column, stream=df.stream),
+ self.dtype.plc_type,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.LenChars:
- column = self.children[0].evaluate(df, context=context).obj
+ plc_column = self.children[0].evaluate(df, context=context).obj
  return Column(
  plc.unary.cast(
- plc.strings.attributes.count_characters(column), self.dtype.plc
+ plc.strings.attributes.count_characters(
+ plc_column, stream=df.stream
+ ),
+ self.dtype.plc_type,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
@@ -567,8 +625,13 @@ class StringFunction(Expr):
  return Column(
  plc.strings.slice.slice_strings(
  column.obj,
- plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
- plc.Scalar.from_py(stop, plc.DataType(plc.TypeId.INT32)),
+ plc.Scalar.from_py(
+ start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ plc.Scalar.from_py(
+ stop, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
@@ -582,7 +645,7 @@ class StringFunction(Expr):
  column = child.evaluate(df, context=context)
  if n == 1 and self.name is StringFunction.Name.SplitN:
  plc_column = plc.Column(
- self.dtype.plc,
+ self.dtype.plc_type,
  column.obj.size(),
  None,
  None,
@@ -592,7 +655,9 @@ class StringFunction(Expr):
  )
  else:
  assert isinstance(expr, Literal)
- by = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+ by = plc.Scalar.from_py(
+ expr.value, expr.dtype.plc_type, stream=df.stream
+ )
  # See https://github.com/pola-rs/polars/issues/11640
  # for SplitN vs SplitExact edge case behaviors
  max_splits = n if is_split_n else 0
@@ -600,13 +665,16 @@ class StringFunction(Expr):
  column.obj,
  by,
  max_splits - 1,
+ stream=df.stream,
  )
  children = plc_table.columns()
  ref_column = children[0]
  if (remainder := n - len(children)) > 0:
  # Reach expected number of splits by padding with nulls
  children.extend(
- plc.Column.all_null_like(ref_column, ref_column.size())
+ plc.Column.all_null_like(
+ ref_column, ref_column.size(), stream=df.stream
+ )
  for _ in range(remainder + int(not is_split_n))
  )
  if not is_split_n:
@@ -614,7 +682,7 @@ class StringFunction(Expr):
  # TODO: Use plc.Column.struct_from_children once it is generalized
  # to handle columns that don't share the same null_mask/null_count
  plc_column = plc.Column(
- self.dtype.plc,
+ self.dtype.plc_type,
  ref_column.size(),
  None,
  None,
@@ -628,9 +696,11 @@ class StringFunction(Expr):
  StringFunction.Name.StripSuffix,
  }:
  child, expr = self.children
- column = child.evaluate(df, context=context).obj
+ plc_column = child.evaluate(df, context=context).obj
  assert isinstance(expr, Literal)
- target = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+ target = plc.Scalar.from_py(
+ expr.value, expr.dtype.plc_type, stream=df.stream
+ )
  if self.name == StringFunction.Name.StripPrefix:
  find = plc.strings.find.starts_with
  start = len(expr.value)
@@ -640,17 +710,23 @@ class StringFunction(Expr):
  start = 0
  end = -len(expr.value)

- mask = find(column, target)
+ mask = find(plc_column, target, stream=df.stream)
  sliced = plc.strings.slice.slice_strings(
- column,
- plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
- plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+ plc_column,
+ plc.Scalar.from_py(
+ start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ plc.Scalar.from_py(
+ end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ stream=df.stream,
  )
  return Column(
  plc.copying.copy_if_else(
  sliced,
- column,
+ plc_column,
  mask,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
@@ -667,7 +743,12 @@ class StringFunction(Expr):
  else:
  side = plc.strings.SideType.BOTH
  return Column(
- plc.strings.strip.strip(column.obj, side, chars.obj_scalar),
+ plc.strings.strip.strip(
+ column.obj,
+ side,
+ chars.obj_scalar(stream=df.stream),
+ stream=df.stream,
+ ),
  dtype=self.dtype,
  )

@@ -678,15 +759,17 @@ class StringFunction(Expr):
  if self.children[1].value is None:
  return Column(
  plc.Column.from_scalar(
- plc.Scalar.from_py(None, self.dtype.plc),
+ plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
  column.size,
+ stream=df.stream,
  ),
  self.dtype,
  )
  elif self.children[1].value == 0:
  result = plc.Column.from_scalar(
- plc.Scalar.from_py("", self.dtype.plc),
+ plc.Scalar.from_py("", self.dtype.plc_type, stream=df.stream),
  column.size,
+ stream=df.stream,
  )
  if column.obj.null_mask():
  result = result.with_mask(
@@ -700,9 +783,14 @@ class StringFunction(Expr):
  return Column(
  plc.strings.slice.slice_strings(
  column.obj,
- plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
- plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+ plc.Scalar.from_py(
+ start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ plc.Scalar.from_py(
+ end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
  None,
+ stream=df.stream,
  ),
  self.dtype,
  )
@@ -715,16 +803,22 @@ class StringFunction(Expr):
  if end is None:
  return Column(
  plc.Column.from_scalar(
- plc.Scalar.from_py(None, self.dtype.plc),
+ plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
  column.size,
+ stream=df.stream,
  ),
  self.dtype,
  )
  return Column(
  plc.strings.slice.slice_strings(
  column.obj,
- plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT32)),
- plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+ plc.Scalar.from_py(
+ 0, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ plc.Scalar.from_py(
+ end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+ ),
+ stream=df.stream,
  ),
  self.dtype,
  )
@@ -732,18 +826,25 @@ class StringFunction(Expr):
  columns = [child.evaluate(df, context=context) for child in self.children]
  if self.name is StringFunction.Name.Lowercase:
  (column,) = columns
- return Column(plc.strings.case.to_lower(column.obj), dtype=self.dtype)
+ return Column(
+ plc.strings.case.to_lower(column.obj, stream=df.stream),
+ dtype=self.dtype,
+ )
  elif self.name is StringFunction.Name.Uppercase:
  (column,) = columns
- return Column(plc.strings.case.to_upper(column.obj), dtype=self.dtype)
+ return Column(
+ plc.strings.case.to_upper(column.obj, stream=df.stream),
+ dtype=self.dtype,
+ )
  elif self.name is StringFunction.Name.EndsWith:
  column, suffix = columns
  return Column(
  plc.strings.find.ends_with(
  column.obj,
- suffix.obj_scalar
+ suffix.obj_scalar(stream=df.stream)
  if column.size != suffix.size and suffix.is_scalar
  else suffix.obj,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
@@ -752,9 +853,10 @@ class StringFunction(Expr):
  return Column(
  plc.strings.find.starts_with(
  column.obj,
- prefix.obj_scalar
+ prefix.obj_scalar(stream=df.stream)
  if column.size != prefix.size and prefix.is_scalar
  else prefix.obj,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
@@ -766,107 +868,147 @@ class StringFunction(Expr):
  if plc_col.null_count() == plc_col.size():
  return Column(
  plc.Column.from_scalar(
- plc.Scalar.from_py(None, self.dtype.plc),
+ plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
  plc_col.size(),
+ stream=df.stream,
  ),
  self.dtype,
  )
  if format is None:
  # Polars begins inference with the first non null value
  if plc_col.null_mask() is not None:
- boolmask = plc.unary.is_valid(plc_col)
+ boolmask = plc.unary.is_valid(plc_col, stream=df.stream)
  table = plc.stream_compaction.apply_boolean_mask(
- plc.Table([plc_col]), boolmask
+ plc.Table([plc_col]), boolmask, stream=df.stream
  )
  filtered = table.columns()[0]
- first_valid_data = plc.copying.get_element(filtered, 0).to_py()
+ first_valid_data = plc.copying.get_element(
+ filtered, 0, stream=df.stream
+ ).to_py(stream=df.stream)
  else:
- first_valid_data = plc.copying.get_element(plc_col, 0).to_py()
+ first_valid_data = plc.copying.get_element(
+ plc_col, 0, stream=df.stream
+ ).to_py(stream=df.stream)

- format = _infer_datetime_format(first_valid_data)
+ # See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
+ format = _infer_datetime_format(first_valid_data) # type: ignore[arg-type]
  if not format:
  raise InvalidOperationError(
  "Unable to infer datetime format from data"
  )

  is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
- plc_col, format
+ plc_col, format, stream=df.stream
  )
  if strict:
  if not plc.reduce.reduce(
  is_timestamps,
  plc.aggregation.all(),
  plc.DataType(plc.TypeId.BOOL8),
- ).to_py():
+ stream=df.stream,
+ ).to_py(stream=df.stream):
  raise InvalidOperationError("conversion from `str` failed.")
  else:
  not_timestamps = plc.unary.unary_operation(
- is_timestamps, plc.unary.UnaryOperator.NOT
+ is_timestamps, plc.unary.UnaryOperator.NOT, stream=df.stream
  )
- null = plc.Scalar.from_py(None, plc_col.type())
+ null = plc.Scalar.from_py(None, plc_col.type(), stream=df.stream)
  plc_col = plc.copying.boolean_mask_scatter(
- [null], plc.Table([plc_col]), not_timestamps
+ [null], plc.Table([plc_col]), not_timestamps, stream=df.stream
  ).columns()[0]

  return Column(
  plc.strings.convert.convert_datetime.to_timestamps(
- plc_col, self.dtype.plc, format
+ plc_col, self.dtype.plc_type, format, stream=df.stream
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.Replace:
- column, target, repl = columns
+ col_column, col_target, col_repl = columns
  n, _ = self.options
  return Column(
  plc.strings.replace.replace(
- column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n
+ col_column.obj,
+ col_target.obj_scalar(stream=df.stream),
+ col_repl.obj_scalar(stream=df.stream),
+ maxrepl=n,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.ReplaceMany:
- column, target, repl = columns
+ col_column, col_target, col_repl = columns
  return Column(
- plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj),
+ plc.strings.replace.replace_multiple(
+ col_column.obj, col_target.obj, col_repl.obj, stream=df.stream
+ ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.PadStart:
  if POLARS_VERSION_LT_132: # pragma: no cover
  (column,) = columns
- width, char = self.options
+ width_arg, char = self.options
+ pad_width = cast(int, width_arg)
  else:
  (column, width_col) = columns
  (char,) = self.options
  # TODO: Maybe accept a string scalar in
  # cudf::strings::pad to avoid DtoH transfer
- width = width_col.obj.to_scalar().to_py()
+ # See https://github.com/rapidsai/cudf/issues/20202
+ width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+ stream=df.stream
+ )
+ assert width_py is not None
+ pad_width = int(width_py)
+
  return Column(
  plc.strings.padding.pad(
- column.obj, width, plc.strings.SideType.LEFT, char
+ column.obj,
+ pad_width,
+ plc.strings.SideType.LEFT,
+ char,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.PadEnd:
  if POLARS_VERSION_LT_132: # pragma: no cover
  (column,) = columns
- width, char = self.options
+ width_arg, char = self.options
+ pad_width = cast(int, width_arg)
  else:
  (column, width_col) = columns
  (char,) = self.options
  # TODO: Maybe accept a string scalar in
  # cudf::strings::pad to avoid DtoH transfer
- width = width_col.obj.to_scalar().to_py()
+ width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+ stream=df.stream
+ )
+ assert width_py is not None
+ pad_width = int(width_py)
+
  return Column(
  plc.strings.padding.pad(
- column.obj, width, plc.strings.SideType.RIGHT, char
+ column.obj,
+ pad_width,
+ plc.strings.SideType.RIGHT,
+ char,
+ stream=df.stream,
  ),
  dtype=self.dtype,
  )
  elif self.name is StringFunction.Name.Reverse:
  (column,) = columns
- return Column(plc.strings.reverse.reverse(column.obj), dtype=self.dtype)
+ return Column(
+ plc.strings.reverse.reverse(column.obj, stream=df.stream),
+ dtype=self.dtype,
+ )
  elif self.name is StringFunction.Name.Titlecase:
  (column,) = columns
- return Column(plc.strings.capitalize.title(column.obj), dtype=self.dtype)
+ return Column(
+ plc.strings.capitalize.title(column.obj, stream=df.stream),
+ dtype=self.dtype,
+ )
  raise NotImplementedError(
  f"StringFunction {self.name}"
  ) # pragma: no cover; handled by init raising