cudf-polars-cu13 25.10.0__py3-none-any.whl → 25.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +32 -8
  4. cudf_polars/containers/column.py +94 -59
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +235 -102
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +9 -3
  19. cudf_polars/dsl/expressions/unary.py +117 -58
  20. cudf_polars/dsl/ir.py +923 -290
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +294 -97
  24. cudf_polars/dsl/utils/aggregations.py +34 -26
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +45 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +791 -1
  31. cudf_polars/experimental/benchmarks/utils.py +515 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/explain.py +15 -2
  35. cudf_polars/experimental/expressions.py +22 -10
  36. cudf_polars/experimental/groupby.py +23 -4
  37. cudf_polars/experimental/io.py +93 -83
  38. cudf_polars/experimental/join.py +39 -22
  39. cudf_polars/experimental/parallel.py +60 -14
  40. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  41. cudf_polars/experimental/rapidsmpf/core.py +361 -0
  42. cudf_polars/experimental/rapidsmpf/dispatch.py +150 -0
  43. cudf_polars/experimental/rapidsmpf/io.py +604 -0
  44. cudf_polars/experimental/rapidsmpf/join.py +237 -0
  45. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  46. cudf_polars/experimental/rapidsmpf/nodes.py +494 -0
  47. cudf_polars/experimental/rapidsmpf/repartition.py +151 -0
  48. cudf_polars/experimental/rapidsmpf/shuffle.py +277 -0
  49. cudf_polars/experimental/rapidsmpf/union.py +96 -0
  50. cudf_polars/experimental/rapidsmpf/utils.py +162 -0
  51. cudf_polars/experimental/repartition.py +9 -2
  52. cudf_polars/experimental/select.py +177 -14
  53. cudf_polars/experimental/shuffle.py +28 -8
  54. cudf_polars/experimental/sort.py +92 -25
  55. cudf_polars/experimental/statistics.py +24 -5
  56. cudf_polars/experimental/utils.py +25 -7
  57. cudf_polars/testing/asserts.py +13 -8
  58. cudf_polars/testing/io.py +2 -1
  59. cudf_polars/testing/plugin.py +88 -15
  60. cudf_polars/typing/__init__.py +86 -32
  61. cudf_polars/utils/config.py +406 -58
  62. cudf_polars/utils/cuda_stream.py +70 -0
  63. cudf_polars/utils/versions.py +3 -2
  64. cudf_polars_cu13-25.12.0.dist-info/METADATA +182 -0
  65. cudf_polars_cu13-25.12.0.dist-info/RECORD +104 -0
  66. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  67. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  68. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/WHEEL +0 -0
  69. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/licenses/LICENSE +0 -0
  70. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/top_level.txt +0 -0
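Most of the churn in the cudf_polars/dsl/expressions/string.py hunks below follows one mechanical pattern: every pylibcudf call now threads an explicit CUDA stream (stream=df.stream), DataType.plc is renamed to DataType.plc_type (and DataType.polars to DataType.polars_type), and Column.obj_scalar changes from a property to a method taking a stream. A minimal sketch of the call-site change, using only call signatures that appear in this diff; the pyarrow construction and the rmm DEFAULT_STREAM import path are assumptions added to make the sketch self-contained, not something this diff shows:

import pyarrow as pa
import pylibcudf as plc
from rmm.pylibrmm.stream import DEFAULT_STREAM  # assumed import path

# Build a device string column to operate on (illustrative setup).
strings = plc.interop.from_arrow(pa.array(["Hello", "WORLD", None]))

# 25.10 style: the kernel launches on whatever stream pylibcudf defaults to.
lowered = plc.strings.case.to_lower(strings)

# 25.12 style (as in the diff below): the caller passes the stream
# explicitly, letting cudf-polars order this work against everything
# else it has queued on the same stream.
lowered = plc.strings.case.to_lower(strings, stream=DEFAULT_STREAM)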
@@ -10,10 +10,10 @@ import functools
 import re
 from datetime import datetime
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, cast
 
+from polars import Struct as pl_Struct, polars  # type: ignore[attr-defined]
 from polars.exceptions import InvalidOperationError
-from polars.polars import dtype_str_repr
 
 import pylibcudf as plc
 
@@ -26,8 +26,6 @@ from cudf_polars.utils.versions import POLARS_VERSION_LT_132
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-    from polars.polars import _expr_nodes as pl_expr
-
     from cudf_polars.containers import DataFrame, DataType
 
 __all__ = ["StringFunction"]
@@ -37,10 +35,15 @@ JsonDecodeType = list[tuple[str, plc.DataType, "JsonDecodeType"]]
 
 def _dtypes_for_json_decode(dtype: DataType) -> JsonDecodeType:
     """Get the dtypes for json decode."""
+    # Type checker doesn't narrow polars_type through dtype.id() check
    if dtype.id() == plc.TypeId.STRUCT:
        return [
-            (field.name, child.plc, _dtypes_for_json_decode(child))
-            for field, child in zip(dtype.polars.fields, dtype.children, strict=True)
+            (field.name, child.plc_type, _dtypes_for_json_decode(child))
+            for field, child in zip(
+                cast(pl_Struct, dtype.polars_type).fields,
+                dtype.children,
+                strict=True,
+            )
        ]
    else:
        return []
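For reference, a worked example of the shape _dtypes_for_json_decode produces for a nested struct; the concrete dtypes here are illustrative, not taken from the diff:

# Input polars dtype: Struct({"a": Int64, "b": Struct({"c": String})})
# The helper returns (field name, pylibcudf dtype, children) triples:
# [
#     ("a", plc.DataType(plc.TypeId.INT64), []),
#     ("b", plc.DataType(plc.TypeId.STRUCT), [
#         ("c", plc.DataType(plc.TypeId.STRING), []),
#     ]),
# ]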
@@ -96,7 +99,7 @@ class StringFunction(Expr):
         ZFill = auto()
 
         @classmethod
-        def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
+        def from_polars(cls, obj: polars._expr_nodes.StringFunction) -> Self:
             """Convert from polars' `StringFunction`."""
             try:
                 function, name = str(obj).split(".", maxsplit=1)
@@ -278,7 +281,7 @@ class StringFunction(Expr):
                 and width.value is not None
                 and width.value < 0
             ):  # pragma: no cover
-                dtypestr = dtype_str_repr(width.dtype.polars)
+                dtypestr = polars.dtype_str_repr(width.dtype.polars_type)
                 raise InvalidOperationError(
                     f"conversion from `{dtypestr}` to `u64` "
                     f"failed in column 'literal' for 1 out of "
@@ -310,14 +313,17 @@ class StringFunction(Expr):
             columns = [
                 Column(
                     child.evaluate(df, context=context).obj, dtype=child.dtype
-                ).astype(self.dtype)
+                ).astype(self.dtype, stream=df.stream)
                 for child in self.children
             ]
+            if len(columns) == 1:
+                return columns[0]
 
             non_unit_sizes = [c.size for c in columns if c.size != 1]
             broadcasted = broadcast(
                 *columns,
                 target_length=max(non_unit_sizes) if non_unit_sizes else None,
+                stream=df.stream,
             )
 
             delimiter, ignore_nulls = self.options
@@ -325,24 +331,39 @@ class StringFunction(Expr):
             return Column(
                 plc.strings.combine.concatenate(
                     plc.Table([col.obj for col in broadcasted]),
-                    plc.Scalar.from_py(delimiter, self.dtype.plc),
-                    None if ignore_nulls else plc.Scalar.from_py(None, self.dtype.plc),
+                    plc.Scalar.from_py(
+                        delimiter, self.dtype.plc_type, stream=df.stream
+                    ),
+                    None
+                    if ignore_nulls
+                    else plc.Scalar.from_py(
+                        None, self.dtype.plc_type, stream=df.stream
+                    ),
                     None,
                     plc.strings.combine.SeparatorOnNulls.NO,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.ConcatVertical:
             (child,) = self.children
-            column = child.evaluate(df, context=context).astype(self.dtype)
+            column = child.evaluate(df, context=context).astype(
+                self.dtype, stream=df.stream
+            )
             delimiter, ignore_nulls = self.options
             if column.null_count > 0 and not ignore_nulls:
-                return Column(plc.Column.all_null_like(column.obj, 1), dtype=self.dtype)
+                return Column(
+                    plc.Column.all_null_like(column.obj, 1, stream=df.stream),
+                    dtype=self.dtype,
+                )
             return Column(
                 plc.strings.combine.join_strings(
                     column.obj,
-                    plc.Scalar.from_py(delimiter, self.dtype.plc),
-                    plc.Scalar.from_py(None, self.dtype.plc),
+                    plc.Scalar.from_py(
+                        delimiter, self.dtype.plc_type, stream=df.stream
+                    ),
+                    plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -351,18 +372,24 @@ class StringFunction(Expr):
             # polars pads based on bytes, libcudf by visual width
             # only pass chars if the visual width matches the byte length
             column = self.children[0].evaluate(df, context=context)
-            col_len_bytes = plc.strings.attributes.count_bytes(column.obj)
-            col_len_chars = plc.strings.attributes.count_characters(column.obj)
+            col_len_bytes = plc.strings.attributes.count_bytes(
+                column.obj, stream=df.stream
+            )
+            col_len_chars = plc.strings.attributes.count_characters(
+                column.obj, stream=df.stream
+            )
             equal = plc.binaryop.binary_operation(
                 col_len_bytes,
                 col_len_chars,
                 plc.binaryop.BinaryOperator.NULL_EQUALS,
                 plc.DataType(plc.TypeId.BOOL8),
+                stream=df.stream,
             )
             if not plc.reduce.reduce(
                 equal,
                 plc.aggregation.all(),
                 plc.DataType(plc.TypeId.BOOL8),
+                stream=df.stream,
             ).to_py():
                 raise InvalidOperationError(
                     "zfill only supports ascii strings with no unicode characters"
@@ -373,22 +400,31 @@ class StringFunction(Expr):
                 if width.value is None:
                     return Column(
                         plc.Column.from_scalar(
-                            plc.Scalar.from_py(None, self.dtype.plc),
+                            plc.Scalar.from_py(
+                                None, self.dtype.plc_type, stream=df.stream
+                            ),
                             column.size,
+                            stream=df.stream,
                         ),
                         self.dtype,
                     )
                 return Column(
-                    plc.strings.padding.zfill(column.obj, width.value), self.dtype
+                    plc.strings.padding.zfill(
+                        column.obj, width.value, stream=df.stream
+                    ),
+                    self.dtype,
                 )
             else:
                 col_width = self.children[1].evaluate(df, context=context)
                 assert isinstance(col_width, Column)
                 all_gt_0 = plc.binaryop.binary_operation(
                     col_width.obj,
-                    plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT64)),
+                    plc.Scalar.from_py(
+                        0, plc.DataType(plc.TypeId.INT64), stream=df.stream
+                    ),
                     plc.binaryop.BinaryOperator.GREATER_EQUAL,
                     plc.DataType(plc.TypeId.BOOL8),
+                    stream=df.stream,
                 )
 
                 if (
@@ -397,12 +433,15 @@
                         all_gt_0,
                         plc.aggregation.all(),
                         plc.DataType(plc.TypeId.BOOL8),
+                        stream=df.stream,
                     ).to_py()
                 ):  # pragma: no cover
                     raise InvalidOperationError("fill conversion failed.")
 
                 return Column(
-                    plc.strings.padding.zfill_by_widths(column.obj, col_width.obj),
+                    plc.strings.padding.zfill_by_widths(
+                        column.obj, col_width.obj, stream=df.stream
+                    ),
                     self.dtype,
                 )
 
@@ -414,34 +453,39 @@
             if literal:
                 pat = arg.evaluate(df, context=context)
                 pattern = (
-                    pat.obj_scalar
+                    pat.obj_scalar(stream=df.stream)
                     if pat.is_scalar and pat.size != column.size
                     else pat.obj
                 )
                 return Column(
-                    plc.strings.find.contains(column.obj, pattern), dtype=self.dtype
+                    plc.strings.find.contains(column.obj, pattern, stream=df.stream),
+                    dtype=self.dtype,
                 )
             else:
                 return Column(
-                    plc.strings.contains.contains_re(column.obj, self._regex_program),
+                    plc.strings.contains.contains_re(
+                        column.obj, self._regex_program, stream=df.stream
+                    ),
                     dtype=self.dtype,
                 )
         elif self.name is StringFunction.Name.ContainsAny:
             (ascii_case_insensitive,) = self.options
             child, arg = self.children
-            column = child.evaluate(df, context=context).obj
-            targets = arg.evaluate(df, context=context).obj
+            plc_column = child.evaluate(df, context=context).obj
+            plc_targets = arg.evaluate(df, context=context).obj
             if ascii_case_insensitive:
-                column = plc.strings.case.to_lower(column)
-                targets = plc.strings.case.to_lower(targets)
+                plc_column = plc.strings.case.to_lower(plc_column, stream=df.stream)
+                plc_targets = plc.strings.case.to_lower(plc_targets, stream=df.stream)
             contains = plc.strings.find_multiple.contains_multiple(
-                column,
-                targets,
+                plc_column,
+                plc_targets,
+                stream=df.stream,
             )
             binary_or = functools.partial(
                 plc.binaryop.binary_operation,
                 op=plc.binaryop.BinaryOperator.BITWISE_OR,
-                output_type=self.dtype.plc,
+                output_type=self.dtype.plc_type,
+                stream=df.stream,
             )
             return Column(
                 functools.reduce(binary_or, contains.columns()),
@@ -449,28 +493,30 @@
             )
         elif self.name is StringFunction.Name.CountMatches:
             (child, _) = self.children
-            column = child.evaluate(df, context=context).obj
+            plc_column = child.evaluate(df, context=context).obj
             return Column(
                 plc.unary.cast(
-                    plc.strings.contains.count_re(column, self._regex_program),
-                    self.dtype.plc,
+                    plc.strings.contains.count_re(
+                        plc_column, self._regex_program, stream=df.stream
+                    ),
+                    self.dtype.plc_type,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.Extract:
             (group_index,) = self.options
-            column = self.children[0].evaluate(df, context=context).obj
+            plc_column = self.children[0].evaluate(df, context=context).obj
             return Column(
                 plc.strings.extract.extract_single(
-                    column, self._regex_program, group_index - 1
+                    plc_column, self._regex_program, group_index - 1, stream=df.stream
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.ExtractGroups:
-            column = self.children[0].evaluate(df, context=context).obj
+            plc_column = self.children[0].evaluate(df, context=context).obj
             plc_table = plc.strings.extract.extract(
-                column,
-                self._regex_program,
+                plc_column, self._regex_program, stream=df.stream
             )
             return Column(
                 plc.Column.struct_from_children(plc_table.columns()),
@@ -479,38 +525,45 @@
         elif self.name is StringFunction.Name.Find:
             literal, _ = self.options
             (child, expr) = self.children
-            column = child.evaluate(df, context=context).obj
+            plc_column = child.evaluate(df, context=context).obj
             if literal:
                 assert isinstance(expr, Literal)
                 plc_column = plc.strings.find.find(
-                    column,
-                    plc.Scalar.from_py(expr.value, expr.dtype.plc),
+                    plc_column,
+                    plc.Scalar.from_py(
+                        expr.value, expr.dtype.plc_type, stream=df.stream
+                    ),
+                    stream=df.stream,
                 )
             else:
                 plc_column = plc.strings.findall.find_re(
-                    column,
-                    self._regex_program,
+                    plc_column, self._regex_program, stream=df.stream
                 )
             # Polars returns None for not found, libcudf returns -1
             new_mask, null_count = plc.transform.bools_to_mask(
                 plc.binaryop.binary_operation(
                     plc_column,
-                    plc.Scalar.from_py(-1, plc_column.type()),
+                    plc.Scalar.from_py(-1, plc_column.type(), stream=df.stream),
                     plc.binaryop.BinaryOperator.NOT_EQUAL,
                     plc.DataType(plc.TypeId.BOOL8),
-                )
+                    stream=df.stream,
+                ),
+                stream=df.stream,
             )
             plc_column = plc.unary.cast(
-                plc_column.with_mask(new_mask, null_count), self.dtype.plc
+                plc_column.with_mask(new_mask, null_count),
+                self.dtype.plc_type,
+                stream=df.stream,
             )
             return Column(plc_column, dtype=self.dtype)
         elif self.name is StringFunction.Name.JsonDecode:
             plc_column = self.children[0].evaluate(df, context=context).obj
             plc_table_with_metadata = plc.io.json.read_json_from_string_column(
                 plc_column,
-                plc.Scalar.from_py("\n"),
-                plc.Scalar.from_py("NULL"),
+                plc.Scalar.from_py("\n", stream=df.stream),
+                plc.Scalar.from_py("NULL", stream=df.stream),
                 _dtypes_for_json_decode(self.dtype),
+                stream=df.stream,
             )
             return Column(
                 plc.Column.struct_from_children(plc_table_with_metadata.columns),
@@ -518,26 +571,34 @@
             )
         elif self.name is StringFunction.Name.JsonPathMatch:
             (child, expr) = self.children
-            column = child.evaluate(df, context=context).obj
+            plc_column = child.evaluate(df, context=context).obj
             assert isinstance(expr, Literal)
-            json_path = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+            json_path = plc.Scalar.from_py(
+                expr.value, expr.dtype.plc_type, stream=df.stream
+            )
             return Column(
-                plc.json.get_json_object(column, json_path),
+                plc.json.get_json_object(plc_column, json_path, stream=df.stream),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.LenBytes:
-            column = self.children[0].evaluate(df, context=context).obj
+            plc_column = self.children[0].evaluate(df, context=context).obj
             return Column(
                 plc.unary.cast(
-                    plc.strings.attributes.count_bytes(column), self.dtype.plc
+                    plc.strings.attributes.count_bytes(plc_column, stream=df.stream),
+                    self.dtype.plc_type,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.LenChars:
-            column = self.children[0].evaluate(df, context=context).obj
+            plc_column = self.children[0].evaluate(df, context=context).obj
             return Column(
                 plc.unary.cast(
-                    plc.strings.attributes.count_characters(column), self.dtype.plc
+                    plc.strings.attributes.count_characters(
+                        plc_column, stream=df.stream
+                    ),
+                    self.dtype.plc_type,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -567,8 +628,13 @@
             return Column(
                 plc.strings.slice.slice_strings(
                     column.obj,
-                    plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
-                    plc.Scalar.from_py(stop, plc.DataType(plc.TypeId.INT32)),
+                    plc.Scalar.from_py(
+                        start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                    ),
+                    plc.Scalar.from_py(
+                        stop, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                    ),
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -582,7 +648,7 @@
             column = child.evaluate(df, context=context)
             if n == 1 and self.name is StringFunction.Name.SplitN:
                 plc_column = plc.Column(
-                    self.dtype.plc,
+                    self.dtype.plc_type,
                     column.obj.size(),
                     None,
                     None,
@@ -592,7 +658,9 @@
                 )
             else:
                 assert isinstance(expr, Literal)
-                by = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+                by = plc.Scalar.from_py(
+                    expr.value, expr.dtype.plc_type, stream=df.stream
+                )
                 # See https://github.com/pola-rs/polars/issues/11640
                 # for SplitN vs SplitExact edge case behaviors
                 max_splits = n if is_split_n else 0
@@ -600,13 +668,16 @@
                     column.obj,
                     by,
                     max_splits - 1,
+                    stream=df.stream,
                 )
                 children = plc_table.columns()
                 ref_column = children[0]
                 if (remainder := n - len(children)) > 0:
                     # Reach expected number of splits by padding with nulls
                     children.extend(
-                        plc.Column.all_null_like(ref_column, ref_column.size())
+                        plc.Column.all_null_like(
+                            ref_column, ref_column.size(), stream=df.stream
+                        )
                         for _ in range(remainder + int(not is_split_n))
                     )
                 if not is_split_n:
@@ -614,7 +685,7 @@
                     # TODO: Use plc.Column.struct_from_children once it is generalized
                     # to handle columns that don't share the same null_mask/null_count
                     plc_column = plc.Column(
-                        self.dtype.plc,
+                        self.dtype.plc_type,
                         ref_column.size(),
                         None,
                         None,
@@ -628,9 +699,11 @@
             StringFunction.Name.StripSuffix,
         }:
             child, expr = self.children
-            column = child.evaluate(df, context=context).obj
+            plc_column = child.evaluate(df, context=context).obj
             assert isinstance(expr, Literal)
-            target = plc.Scalar.from_py(expr.value, expr.dtype.plc)
+            target = plc.Scalar.from_py(
+                expr.value, expr.dtype.plc_type, stream=df.stream
+            )
             if self.name == StringFunction.Name.StripPrefix:
                 find = plc.strings.find.starts_with
                 start = len(expr.value)
@@ -640,17 +713,23 @@
                 start = 0
                 end = -len(expr.value)
 
-            mask = find(column, target)
+            mask = find(plc_column, target, stream=df.stream)
             sliced = plc.strings.slice.slice_strings(
-                column,
-                plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
-                plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+                plc_column,
+                plc.Scalar.from_py(
+                    start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                ),
+                plc.Scalar.from_py(
+                    end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                ),
+                stream=df.stream,
             )
             return Column(
                 plc.copying.copy_if_else(
                     sliced,
-                    column,
+                    plc_column,
                     mask,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -667,7 +746,12 @@
             else:
                 side = plc.strings.SideType.BOTH
             return Column(
-                plc.strings.strip.strip(column.obj, side, chars.obj_scalar),
+                plc.strings.strip.strip(
+                    column.obj,
+                    side,
+                    chars.obj_scalar(stream=df.stream),
+                    stream=df.stream,
+                ),
                 dtype=self.dtype,
             )
 
@@ -678,15 +762,17 @@
             if self.children[1].value is None:
                 return Column(
                     plc.Column.from_scalar(
-                        plc.Scalar.from_py(None, self.dtype.plc),
+                        plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
                         column.size,
+                        stream=df.stream,
                     ),
                     self.dtype,
                 )
             elif self.children[1].value == 0:
                 result = plc.Column.from_scalar(
-                    plc.Scalar.from_py("", self.dtype.plc),
+                    plc.Scalar.from_py("", self.dtype.plc_type, stream=df.stream),
                     column.size,
+                    stream=df.stream,
                 )
                 if column.obj.null_mask():
                     result = result.with_mask(
@@ -700,9 +786,14 @@
             return Column(
                 plc.strings.slice.slice_strings(
                     column.obj,
-                    plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
-                    plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+                    plc.Scalar.from_py(
+                        start, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                    ),
+                    plc.Scalar.from_py(
+                        end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                    ),
                     None,
+                    stream=df.stream,
                 ),
                 self.dtype,
             )
@@ -715,16 +806,22 @@
             if end is None:
                 return Column(
                     plc.Column.from_scalar(
-                        plc.Scalar.from_py(None, self.dtype.plc),
+                        plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
                         column.size,
+                        stream=df.stream,
                     ),
                     self.dtype,
                 )
             return Column(
                 plc.strings.slice.slice_strings(
                     column.obj,
-                    plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT32)),
-                    plc.Scalar.from_py(end, plc.DataType(plc.TypeId.INT32)),
+                    plc.Scalar.from_py(
+                        0, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                    ),
+                    plc.Scalar.from_py(
+                        end, plc.DataType(plc.TypeId.INT32), stream=df.stream
+                    ),
+                    stream=df.stream,
                 ),
                 self.dtype,
             )
@@ -732,18 +829,25 @@
         columns = [child.evaluate(df, context=context) for child in self.children]
         if self.name is StringFunction.Name.Lowercase:
             (column,) = columns
-            return Column(plc.strings.case.to_lower(column.obj), dtype=self.dtype)
+            return Column(
+                plc.strings.case.to_lower(column.obj, stream=df.stream),
+                dtype=self.dtype,
+            )
         elif self.name is StringFunction.Name.Uppercase:
             (column,) = columns
-            return Column(plc.strings.case.to_upper(column.obj), dtype=self.dtype)
+            return Column(
+                plc.strings.case.to_upper(column.obj, stream=df.stream),
+                dtype=self.dtype,
+            )
         elif self.name is StringFunction.Name.EndsWith:
             column, suffix = columns
             return Column(
                 plc.strings.find.ends_with(
                     column.obj,
-                    suffix.obj_scalar
+                    suffix.obj_scalar(stream=df.stream)
                     if column.size != suffix.size and suffix.is_scalar
                     else suffix.obj,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -752,9 +856,10 @@
             return Column(
                 plc.strings.find.starts_with(
                     column.obj,
-                    prefix.obj_scalar
+                    prefix.obj_scalar(stream=df.stream)
                     if column.size != prefix.size and prefix.is_scalar
                     else prefix.obj,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -766,67 +871,80 @@
             if plc_col.null_count() == plc_col.size():
                 return Column(
                     plc.Column.from_scalar(
-                        plc.Scalar.from_py(None, self.dtype.plc),
+                        plc.Scalar.from_py(None, self.dtype.plc_type, stream=df.stream),
                         plc_col.size(),
+                        stream=df.stream,
                     ),
                     self.dtype,
                 )
             if format is None:
                 # Polars begins inference with the first non null value
                 if plc_col.null_mask() is not None:
-                    boolmask = plc.unary.is_valid(plc_col)
+                    boolmask = plc.unary.is_valid(plc_col, stream=df.stream)
                     table = plc.stream_compaction.apply_boolean_mask(
-                        plc.Table([plc_col]), boolmask
+                        plc.Table([plc_col]), boolmask, stream=df.stream
                     )
                     filtered = table.columns()[0]
-                    first_valid_data = plc.copying.get_element(filtered, 0).to_py()
+                    first_valid_data = plc.copying.get_element(
+                        filtered, 0, stream=df.stream
+                    ).to_py()
                 else:
-                    first_valid_data = plc.copying.get_element(plc_col, 0).to_py()
+                    first_valid_data = plc.copying.get_element(
+                        plc_col, 0, stream=df.stream
+                    ).to_py()
 
-                format = _infer_datetime_format(first_valid_data)
+                # See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
+                format = _infer_datetime_format(first_valid_data)  # type: ignore[arg-type]
                 if not format:
                     raise InvalidOperationError(
                         "Unable to infer datetime format from data"
                     )
 
             is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
-                plc_col, format
+                plc_col, format, stream=df.stream
             )
             if strict:
                 if not plc.reduce.reduce(
                     is_timestamps,
                     plc.aggregation.all(),
                     plc.DataType(plc.TypeId.BOOL8),
+                    stream=df.stream,
                 ).to_py():
                     raise InvalidOperationError("conversion from `str` failed.")
             else:
                 not_timestamps = plc.unary.unary_operation(
-                    is_timestamps, plc.unary.UnaryOperator.NOT
+                    is_timestamps, plc.unary.UnaryOperator.NOT, stream=df.stream
                 )
-                null = plc.Scalar.from_py(None, plc_col.type())
+                null = plc.Scalar.from_py(None, plc_col.type(), stream=df.stream)
                 plc_col = plc.copying.boolean_mask_scatter(
-                    [null], plc.Table([plc_col]), not_timestamps
+                    [null], plc.Table([plc_col]), not_timestamps, stream=df.stream
                 ).columns()[0]
 
             return Column(
                 plc.strings.convert.convert_datetime.to_timestamps(
-                    plc_col, self.dtype.plc, format
+                    plc_col, self.dtype.plc_type, format, stream=df.stream
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.Replace:
-            column, target, repl = columns
+            col_column, col_target, col_repl = columns
             n, _ = self.options
             return Column(
                 plc.strings.replace.replace(
-                    column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n
+                    col_column.obj,
+                    col_target.obj_scalar(stream=df.stream),
+                    col_repl.obj_scalar(stream=df.stream),
+                    maxrepl=n,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.ReplaceMany:
-            column, target, repl = columns
+            col_column, col_target, col_repl = columns
             return Column(
-                plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj),
+                plc.strings.replace.replace_multiple(
+                    col_column.obj, col_target.obj, col_repl.obj, stream=df.stream
+                ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.PadStart:
@@ -838,10 +956,15 @@
             (char,) = self.options
             # TODO: Maybe accept a string scalar in
             # cudf::strings::pad to avoid DtoH transfer
-            width = width_col.obj.to_scalar().to_py()
+            # See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
+            width: int = width_col.obj.to_scalar(stream=df.stream).to_py()  # type: ignore[no-redef]
             return Column(
                 plc.strings.padding.pad(
-                    column.obj, width, plc.strings.SideType.LEFT, char
+                    column.obj,
+                    width,  # type: ignore[arg-type]
+                    plc.strings.SideType.LEFT,
+                    char,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
@@ -854,19 +977,29 @@
             (char,) = self.options
             # TODO: Maybe accept a string scalar in
             # cudf::strings::pad to avoid DtoH transfer
-            width = width_col.obj.to_scalar().to_py()
+            width: int = width_col.obj.to_scalar(stream=df.stream).to_py()  # type: ignore[no-redef]
            return Column(
                 plc.strings.padding.pad(
-                    column.obj, width, plc.strings.SideType.RIGHT, char
+                    column.obj,
+                    width,  # type: ignore[arg-type]
+                    plc.strings.SideType.RIGHT,
+                    char,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
         elif self.name is StringFunction.Name.Reverse:
             (column,) = columns
-            return Column(plc.strings.reverse.reverse(column.obj), dtype=self.dtype)
+            return Column(
+                plc.strings.reverse.reverse(column.obj, stream=df.stream),
+                dtype=self.dtype,
+            )
         elif self.name is StringFunction.Name.Titlecase:
             (column,) = columns
-            return Column(plc.strings.capitalize.title(column.obj), dtype=self.dtype)
+            return Column(
+                plc.strings.capitalize.title(column.obj, stream=df.stream),
+                dtype=self.dtype,
+            )
         raise NotImplementedError(
             f"StringFunction {self.name}"
         )  # pragma: no cover; handled by init raising