cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff compares publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -28,7 +28,10 @@ import importlib.util
 import json
 import os
 import warnings
-from typing import TYPE_CHECKING, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, TypeVar
+
+from rmm.pylibrmm.cuda_stream import CudaStreamFlags
+from rmm.pylibrmm.cuda_stream_pool import CudaStreamPool
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -37,13 +40,18 @@ if TYPE_CHECKING:
 
     import polars.lazyframe.engine_config
 
+    import rmm.mr
+
 
 __all__ = [
+    "Cluster",
     "ConfigOptions",
     "InMemoryExecutor",
     "ParquetOptions",
-    "Scheduler",
+    "Runtime",
+    "Scheduler",  # Deprecated, kept for backward compatibility
     "ShuffleMethod",
+    "ShufflerInsertionMethod",
     "StatsPlanningOptions",
     "StreamingExecutor",
     "StreamingFallbackMode",
@@ -57,8 +65,9 @@ def _env_get_int(name: str, default: int) -> int:
         return default  # pragma: no cover
 
 
-def get_total_device_memory() -> int | None:
-    """Return the total memory of the current device."""
+@functools.cache
+def get_device_handle() -> Any:
+    # Gets called for each IR.do_evaluate node, so we'll cache it.
     import pynvml
 
     try:
@@ -74,12 +83,27 @@ def get_total_device_memory() -> int | None:
                 handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(handle)
         else:
             handle = pynvml.nvmlDeviceGetHandleByIndex(int(index))
-
-        return pynvml.nvmlDeviceGetMemoryInfo(handle).total
-
     except pynvml.NVMLError_NotSupported:  # pragma: no cover
         # System doesn't have proper "GPU memory".
         return None
+    else:
+        return handle
+
+
+def get_total_device_memory() -> int | None:
+    """Return the total memory of the current device."""
+    import pynvml
+
+    maybe_handle = get_device_handle()
+
+    if maybe_handle is not None:
+        try:
+            return pynvml.nvmlDeviceGetMemoryInfo(maybe_handle).total
+        except pynvml.NVMLError_NotSupported:  # pragma: no cover
+            # System doesn't have proper "GPU memory".
+            return None
+    else:  # pragma: no cover
+        return None
 
 
 @functools.cache
@@ -120,14 +144,43 @@ class StreamingFallbackMode(str, enum.Enum):
     SILENT = "silent"
 
 
+class Runtime(str, enum.Enum):
+    """
+    The runtime to use for the streaming executor.
+
+    * ``Runtime.TASKS`` : Use the task-based runtime.
+      This is the default runtime.
+    * ``Runtime.RAPIDSMPF`` : Use the coroutine-based streaming runtime (rapidsmpf).
+      This runtime is experimental.
+    """
+
+    TASKS = "tasks"
+    RAPIDSMPF = "rapidsmpf"
+
+
+class Cluster(str, enum.Enum):
+    """
+    The cluster configuration for the streaming executor.
+
+    * ``Cluster.SINGLE`` : Single-GPU execution. Currently uses a zero-dependency,
+      synchronous, single-threaded task scheduler.
+    * ``Cluster.DISTRIBUTED`` : Multi-GPU distributed execution. Currently
+      uses a Dask-based distributed scheduler and requires an
+      active Dask cluster.
+    """
+
+    SINGLE = "single"
+    DISTRIBUTED = "distributed"
+
+
 class Scheduler(str, enum.Enum):
     """
-    The scheduler to use for the streaming executor.
+    **Deprecated**: Use :class:`Cluster` instead.
+
+    The scheduler to use for the task-based streaming executor.
 
-    * ``Scheduler.SYNCHRONOUS`` : A zero-dependency, synchronous,
-      single-threaded scheduler.
-    * ``Scheduler.DISTRIBUTED`` : A Dask-based distributed scheduler.
-      Using this scheduler requires an active Dask cluster.
+    * ``Scheduler.SYNCHRONOUS`` : Single-GPU execution (use ``Cluster.SINGLE`` instead)
+    * ``Scheduler.DISTRIBUTED`` : Multi-GPU execution (use ``Cluster.DISTRIBUTED`` instead)
     """
 
     SYNCHRONOUS = "synchronous"
@@ -143,12 +196,12 @@ class ShuffleMethod(str, enum.Enum):
     * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler.
 
     With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None``
-    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for the distributed scheduler,
+    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for a distributed cluster,
     but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed.
 
     The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly.
     A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process
-    shuffler automatically when the 'synchronous' scheduler is active.
+    shuffler automatically when using single-GPU execution.
     """
 
     TASKS = "tasks"
@@ -156,6 +209,20 @@ class ShuffleMethod(str, enum.Enum):
     _RAPIDSMPF_SINGLE = "rapidsmpf-single"
 
 
+class ShufflerInsertionMethod(str, enum.Enum):
+    """
+    The method to use for inserting chunks into the rapidsmpf shuffler.
+
+    * ``ShufflerInsertionMethod.INSERT_CHUNKS`` : Use insert_chunks for inserting data.
+    * ``ShufflerInsertionMethod.CONCAT_INSERT`` : Use concat_insert for inserting data.
+
+    Only applicable with the "rapidsmpf" shuffle method and the "tasks" runtime.
+    """
+
+    INSERT_CHUNKS = "insert_chunks"
+    CONCAT_INSERT = "concat_insert"
+
+
 T = TypeVar("T")
 
 
@@ -215,6 +282,10 @@ class ParquetOptions:
 
         Set to 0 to avoid row-group sampling. Note that row-group sampling
        will also be skipped if ``max_footer_samples`` is 0.
+    use_rapidsmpf_native
+        Whether to use the native rapidsmpf node for parquet reading.
+        This option is only used when the rapidsmpf runtime is enabled.
+        Default is True.
     """
 
     _env_prefix = "CUDF_POLARS__PARQUET_OPTIONS"
@@ -249,6 +320,13 @@ class ParquetOptions:
             f"{_env_prefix}__MAX_ROW_GROUP_SAMPLES", int, default=1
         )
     )
+    use_rapidsmpf_native: bool = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__USE_RAPIDSMPF_NATIVE",
+            _bool_converter,
+            default=True,
+        )
+    )
 
     def __post_init__(self) -> None:  # noqa: D105
         if not isinstance(self.chunked, bool):
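A hedged sketch of how the new flag could be toggled, using the ``parquet_options`` engine key this class is built from and the ``CUDF_POLARS__PARQUET_OPTIONS__USE_RAPIDSMPF_NATIVE`` variable wired up above; the exact strings ``_bool_converter`` accepts are not shown in this hunk, so the environment form is an assumption:

    import os

    import polars as pl

    # Illustrative only: keyword form, forwarded into ParquetOptions(**...).
    engine = pl.GPUEngine(parquet_options={"use_rapidsmpf_native": False})

    # Illustrative only: environment form, read by the field's default factory,
    # so it must be set before the options object is constructed.
    os.environ["CUDF_POLARS__PARQUET_OPTIONS__USE_RAPIDSMPF_NATIVE"] = "0"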
@@ -263,9 +341,11 @@ class ParquetOptions:
             raise TypeError("max_footer_samples must be an int")
         if not isinstance(self.max_row_group_samples, int):
             raise TypeError("max_row_group_samples must be an int")
+        if not isinstance(self.use_rapidsmpf_native, bool):
+            raise TypeError("use_rapidsmpf_native must be a bool")
 
 
-def default_blocksize(scheduler: str) -> int:
+def default_blocksize(cluster: str) -> int:
     """Return the default blocksize."""
     device_size = get_total_device_memory()
     if device_size is None:  # pragma: no cover
@@ -274,7 +354,7 @@ def default_blocksize(scheduler: str) -> int:
         return 1_000_000_000
 
     if (
-        scheduler == "distributed"
+        cluster == "distributed"
         or _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 0
     ):
         # Distributed execution requires a conservative
@@ -370,6 +450,102 @@ class StatsPlanningOptions:
             raise TypeError("default_selectivity must be a float")
 
 
+@dataclasses.dataclass(frozen=True, eq=True)
+class MemoryResourceConfig:
+    """
+    Configuration for the default memory resource.
+
+    Parameters
+    ----------
+    qualname
+        The fully qualified name of the memory resource class to use.
+    options
+        This can be either a dictionary representing the options to pass
+        to the memory resource class, or, a dictionary representing a
+        nested memory resource configuration. The presence of "qualname"
+        field indicates a nested memory resource configuration.
+
+    Examples
+    --------
+    Create a memory resource config for a single memory resource:
+    >>> MemoryResourceConfig(
+    ...     qualname="rmm.mr.CudaAsyncMemoryResource",
+    ...     options={"initial_pool_size": 100},
+    ... )
+
+    Create a memory resource config for a nested memory resource configuration:
+    >>> MemoryResourceConfig(
+    ...     qualname="rmm.mr.PrefetchResourceAdaptor",
+    ...     options={
+    ...         "upstream_mr": {
+    ...             "qualname": "rmm.mr.PoolMemoryResource",
+    ...             "options": {
+    ...                 "upstream_mr": {
+    ...                     "qualname": "rmm.mr.ManagedMemoryResource",
+    ...                 },
+    ...                 "initial_pool_size": 256,
+    ...             },
+    ...         }
+    ...     },
+    ... )
+    """
+
+    _env_prefix = "CUDF_POLARS__MEMORY_RESOURCE_CONFIG"
+    qualname: str = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__QUALNAME",
+            str,
+            # We shouldn't reach here if qualname isn't set in the environment.
+            default=None,  # type: ignore[assignment]
+        )
+    )
+    options: dict[str, Any] | None = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__OPTIONS",
+            json.loads,
+            default=None,
+        )
+    )
+
+    def __post_init__(self) -> None:
+        if self.qualname.count(".") < 1:
+            raise ValueError(
+                f"MemoryResourceConfig.qualname '{self.qualname}' must be a fully qualified name to a class, including the module name."
+            )
+
+    def create_memory_resource(self) -> rmm.mr.DeviceMemoryResource:
+        """Create a memory resource from the configuration."""
+
+        def create_mr(
+            qualname: str, options: dict[str, Any] | None
+        ) -> rmm.mr.DeviceMemoryResource:
+            module_name, class_name = qualname.rsplit(".", 1)
+            module = importlib.import_module(module_name)
+            cls = getattr(module, class_name)
+            return cls(**options or {})
+
+        def process_options(opts: dict[str, Any] | None) -> dict[str, Any]:
+            if opts is None:
+                return {}
+
+            processed = {}
+            for key, value in opts.items():
+                if isinstance(value, dict) and "qualname" in value:
+                    # This is a nested memory resource config
+                    nested_qualname = value["qualname"]
+                    nested_options = process_options(value.get("options"))
+                    processed[key] = create_mr(nested_qualname, nested_options)
+                else:
+                    processed[key] = value
+            return processed
+
+        # Create the top-level memory resource
+        return create_mr(self.qualname, process_options(self.options))
+
+    def __hash__(self) -> int:
+        return hash((self.qualname, json.dumps(self.options, sort_keys=True)))
+
+
 @dataclasses.dataclass(frozen=True, eq=True)
 class StreamingExecutor:
     """
@@ -380,11 +556,25 @@ class StreamingExecutor:
 
     Parameters
     ----------
+    runtime
+        The runtime to use for the streaming executor.
+        ``Runtime.TASKS`` by default.
+    cluster
+        The cluster configuration for the streaming executor.
+        ``Cluster.SINGLE`` by default.
+
+        This setting applies to both task-based and rapidsmpf execution modes:
+
+        * ``Cluster.SINGLE``: Single-GPU execution
+        * ``Cluster.DISTRIBUTED``: Multi-GPU distributed execution (requires
+          an active Dask cluster)
+
     scheduler
-        The scheduler to use for the streaming executor. ``Scheduler.SYNCHRONOUS``
-        by default.
+        **Deprecated**: Use ``cluster`` instead.
 
-        Note ``scheduler="distributed"`` requires a Dask cluster to be running.
+        For backward compatibility:
+        * ``Scheduler.SYNCHRONOUS`` maps to ``Cluster.SINGLE``
+        * ``Scheduler.DISTRIBUTED`` maps to ``Cluster.DISTRIBUTED``
     fallback_mode
         How to handle errors when the GPU engine fails to execute a query.
         ``StreamingFallbackMode.WARN`` by default.
@@ -414,10 +604,10 @@ class StreamingExecutor:
         - the ``CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE`` environment variable
 
         By default, cudf-polars uses a target partition size that's a fraction
-        of the device memory, where the fraction depends on the scheduler:
+        of the device memory, where the fraction depends on the cluster:
 
         - distributed: 1/40th of the device memory
-        - synchronous: 1/16th of the device memory
+        - single: 1/16th of the device memory
 
         The optional pynvml dependency is used to query the device memory size. If
         pynvml is not available, a warning is emitted and the device size is assumed
@@ -434,36 +624,62 @@ class StreamingExecutor:
         a broadcast join.
     shuffle_method
         The method to use for shuffling data between workers. Defaults to
-        'rapidsmpf' for distributed scheduler if available (otherwise 'tasks'),
-        and 'tasks' for synchronous scheduler.
+        'rapidsmpf' for distributed cluster if available (otherwise 'tasks'),
+        and 'tasks' for single-GPU cluster.
+    shuffler_insertion_method
+        The method to use for inserting chunks with the rapidsmpf shuffler.
+        Can be 'insert_chunks' (default) or 'concat_insert'.
+
+        Only applicable with ``shuffle_method="rapidsmpf"`` and ``runtime="tasks"``.
     rapidsmpf_spill
         Whether to wrap task arguments and output in objects that are
         spillable by 'rapidsmpf'.
+    client_device_threshold
+        Threshold for spilling data from device memory in rapidsmpf.
+        Default is 50% of device memory on the client process.
+        This argument is only used by the "rapidsmpf" runtime.
     sink_to_directory
         Whether multi-partition sink operations should write to a directory
         rather than a single file. By default, this will be set to True for
-        the 'distributed' scheduler and False otherwise. The 'distrubuted'
-        scheduler does not currently support ``sink_to_directory=False``.
+        the 'distributed' cluster and False otherwise. The 'distributed'
+        cluster does not currently support ``sink_to_directory=False``.
     stats_planning
         Options controlling statistics-based query planning. See
         :class:`~cudf_polars.utils.config.StatsPlanningOptions` for more.
+    max_io_threads
+        Maximum number of IO threads for the rapidsmpf runtime. Default is 2.
+        This controls the parallelism of IO operations when reading data.
 
     Notes
     -----
     The streaming executor does not currently support profiling a query via
     the ``.profile()`` method. We recommend using nsys to profile queries
-    with the 'synchronous' scheduler and Dask's built-in profiling tools
-    with the 'distributed' scheduler.
+    with single-GPU execution and Dask's built-in profiling tools
+    with distributed execution.
     """
 
     _env_prefix = "CUDF_POLARS__EXECUTOR"
 
     name: Literal["streaming"] = dataclasses.field(default="streaming", init=False)
-    scheduler: Scheduler = dataclasses.field(
+    runtime: Runtime = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__RUNTIME",
+            Runtime.__call__,
+            default=Runtime.TASKS,
+        )
+    )
+    cluster: Cluster | None = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__CLUSTER",
+            Cluster.__call__,
+            default=None,
+        )
+    )
+    scheduler: Scheduler | None = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__SCHEDULER",
             Scheduler.__call__,
-            default=Scheduler.SYNCHRONOUS,
+            default=None,
         )
     )
     fallback_mode: StreamingFallbackMode = dataclasses.field(
@@ -505,11 +721,23 @@ class StreamingExecutor:
             default=ShuffleMethod.TASKS,
         )
     )
+    shuffler_insertion_method: ShufflerInsertionMethod = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__SHUFFLER_INSERTION_METHOD",
+            ShufflerInsertionMethod.__call__,
+            default=ShufflerInsertionMethod.INSERT_CHUNKS,
+        )
+    )
     rapidsmpf_spill: bool = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__RAPIDSMPF_SPILL", _bool_converter, default=False
        )
    )
+    client_device_threshold: float = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__CLIENT_DEVICE_THRESHOLD", float, default=0.5
+        )
+    )
     sink_to_directory: bool | None = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__SINK_TO_DIRECTORY", _bool_converter, default=None
@@ -518,12 +746,64 @@ class StreamingExecutor:
     stats_planning: StatsPlanningOptions = dataclasses.field(
         default_factory=StatsPlanningOptions
     )
+    max_io_threads: int = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__MAX_IO_THREADS", int, default=2
+        )
+    )
 
     def __post_init__(self) -> None:  # noqa: D105
+        # Check for rapidsmpf runtime
+        if self.runtime == "rapidsmpf":  # pragma: no cover; requires rapidsmpf runtime
+            if not rapidsmpf_single_available():
+                raise ValueError("The rapidsmpf streaming engine requires rapidsmpf.")
+            if self.shuffle_method == "tasks":
+                raise ValueError(
+                    "The rapidsmpf streaming engine does not support task-based shuffling."
+                )
+            object.__setattr__(self, "shuffle_method", "rapidsmpf")
+
+        # Handle backward compatibility for deprecated scheduler parameter
+        if self.scheduler is not None:
+            if self.cluster is not None:
+                raise ValueError(
+                    "Cannot specify both 'scheduler' and 'cluster'. "
+                    "The 'scheduler' parameter is deprecated. "
+                    "Please use only 'cluster' instead."
+                )
+            else:
+                warnings.warn(
+                    """The 'scheduler' parameter is deprecated. Please use 'cluster' instead.
+                    Use 'cluster="single"' instead of 'scheduler="synchronous"' and "
+                    'cluster="distributed"' instead of 'scheduler="distributed"'.""",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+                # Map old scheduler values to new cluster values
+                if self.scheduler == "synchronous":
+                    object.__setattr__(self, "cluster", Cluster.SINGLE)
+                elif self.scheduler == "distributed":
+                    object.__setattr__(self, "cluster", Cluster.DISTRIBUTED)
+                # Clear scheduler to avoid confusion
+                object.__setattr__(self, "scheduler", None)
+        elif self.cluster is None:
+            object.__setattr__(self, "cluster", Cluster.SINGLE)
+        assert self.cluster is not None, "Expected cluster to be set."
+
+        # Warn loudly that multi-GPU execution is under construction
+        # for the rapidsmpf runtime
+        if self.cluster == "distributed" and self.runtime == "rapidsmpf":
+            warnings.warn(
+                "UNDER CONSTRUCTION!!!"
+                "The rapidsmpf runtime does NOT support distributed execution yet. "
+                "Use at your own risk!!!",
+                stacklevel=2,
+            )
+
         # Handle shuffle_method defaults for streaming executor
         if self.shuffle_method is None:
-            if self.scheduler == "distributed" and rapidsmpf_distributed_available():
-                # For distributed scheduler, prefer rapidsmpf if available
+            if self.cluster == "distributed" and rapidsmpf_distributed_available():
+                # For distributed cluster, prefer rapidsmpf if available
                 object.__setattr__(self, "shuffle_method", "rapidsmpf")
             else:
                 # Otherwise, use task-based shuffle for now.
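To make the deprecation path in ``__post_init__`` concrete, a sketch of the old and new spellings, again assuming the ``executor_options`` passthrough; the old form still runs, but it is mapped to a ``cluster`` value and emits the ``FutureWarning`` above once the options are materialized:

    import polars as pl

    # Deprecated spelling: mapped to Cluster.SINGLE, with a FutureWarning.
    old = pl.GPUEngine(executor_options={"scheduler": "synchronous"})

    # Preferred spelling going forward.
    new = pl.GPUEngine(executor_options={"cluster": "single"})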
@@ -534,19 +814,16 @@ class StreamingExecutor:
             raise ValueError("rapidsmpf-single is not a supported shuffle method.")
         elif self.shuffle_method == "rapidsmpf":
             # Check that we have rapidsmpf installed
-            if (
-                self.scheduler == "distributed"
-                and not rapidsmpf_distributed_available()
-            ):
+            if self.cluster == "distributed" and not rapidsmpf_distributed_available():
                 raise ValueError(
                     "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed."
                 )
-            elif self.scheduler == "synchronous" and not rapidsmpf_single_available():
+            elif self.cluster == "single" and not rapidsmpf_single_available():
                 raise ValueError(
                     "rapidsmpf shuffle method requested, but rapidsmpf is not installed."
                 )
-            # Select "rapidsmpf-single" for the synchronous
-            if self.scheduler == "synchronous":
+            # Select "rapidsmpf-single" for single-GPU
+            if self.cluster == "single":
                 object.__setattr__(self, "shuffle_method", "rapidsmpf-single")
 
         # frozen dataclass, so use object.__setattr__
@@ -555,17 +832,24 @@ class StreamingExecutor:
         )
         if self.target_partition_size == 0:
             object.__setattr__(
-                self, "target_partition_size", default_blocksize(self.scheduler)
+                self,
+                "target_partition_size",
+                default_blocksize(self.cluster),
             )
         if self.broadcast_join_limit == 0:
             object.__setattr__(
                 self,
                 "broadcast_join_limit",
-                # Usually better to avoid shuffling for single gpu
-                2 if self.scheduler == "distributed" else 32,
+                # Usually better to avoid shuffling for single gpu with UVM
+                2 if self.cluster == "distributed" else 32,
             )
-        object.__setattr__(self, "scheduler", Scheduler(self.scheduler))
+        object.__setattr__(self, "cluster", Cluster(self.cluster))
         object.__setattr__(self, "shuffle_method", ShuffleMethod(self.shuffle_method))
+        object.__setattr__(
+            self,
+            "shuffler_insertion_method",
+            ShufflerInsertionMethod(self.shuffler_insertion_method),
+        )
 
         # Make sure stats_planning is a dataclass
         if isinstance(self.stats_planning, dict):
@@ -575,10 +859,10 @@ class StreamingExecutor:
                 StatsPlanningOptions(**self.stats_planning),
             )
 
-        if self.scheduler == "distributed":
+        if self.cluster == "distributed":
             if self.sink_to_directory is False:
                 raise ValueError(
-                    "The distributed scheduler requires sink_to_directory=True"
+                    "The distributed cluster requires sink_to_directory=True"
                 )
             object.__setattr__(self, "sink_to_directory", True)
         elif self.sink_to_directory is None:
@@ -599,13 +883,17 @@ class StreamingExecutor:
             raise TypeError("rapidsmpf_spill must be bool")
         if not isinstance(self.sink_to_directory, bool):
             raise TypeError("sink_to_directory must be bool")
+        if not isinstance(self.client_device_threshold, float):
+            raise TypeError("client_device_threshold must be a float")
+        if not isinstance(self.max_io_threads, int):
+            raise TypeError("max_io_threads must be an int")
 
-        # RapidsMPF spill is only supported for the distributed scheduler for now.
+        # RapidsMPF spill is only supported for distributed clusters for now.
         # This is because the spilling API is still within the RMPF-Dask integration.
         # (See https://github.com/rapidsai/rapidsmpf/issues/439)
-        if self.scheduler == "synchronous" and self.rapidsmpf_spill:  # pragma: no cover
+        if self.cluster == "single" and self.rapidsmpf_spill:  # pragma: no cover
             raise ValueError(
-                "rapidsmpf_spill is not supported for the synchronous scheduler."
+                "rapidsmpf_spill is not supported for single-GPU execution."
             )
 
     def __hash__(self) -> int:  # noqa: D105
@@ -622,14 +910,83 @@ class InMemoryExecutor:
     """
     Configuration for the cudf-polars in-memory executor.
 
+    The in-memory executor only supports single-GPU execution.
+    """
+
+    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+
+
+@dataclasses.dataclass(frozen=True, eq=True)
+class CUDAStreamPoolConfig:
+    """
+    Configuration for the CUDA stream pool.
+
     Parameters
     ----------
-    scheduler:
-        The scheduler to use for the in-memory executor. Currently
-        only ``Scheduler.SYNCHRONOUS`` is supported for the in-memory executor.
+    pool_size
+        The size of the CUDA stream pool.
+    flags
+        The flags to use for the CUDA stream pool.
     """
 
-    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+    pool_size: int = 16
+    flags: CudaStreamFlags = CudaStreamFlags.NON_BLOCKING
+
+    def build(self) -> CudaStreamPool:
+        return CudaStreamPool(
+            pool_size=self.pool_size,
+            flags=self.flags,
+        )
+
+
+class CUDAStreamPolicy(str, enum.Enum):
+    """
+    The policy to use for acquiring new CUDA streams.
+
+    * ``CUDAStreamPolicy.DEFAULT`` : Use the default CUDA stream.
+    * ``CUDAStreamPolicy.NEW`` : Create a new CUDA stream.
+    """
+
+    DEFAULT = "default"
+    NEW = "new"
+
+
+def _convert_cuda_stream_policy(
+    user_cuda_stream_policy: dict | str,
+) -> CUDAStreamPolicy | CUDAStreamPoolConfig:
+    match user_cuda_stream_policy:
+        case "default" | "new":
+            return CUDAStreamPolicy(user_cuda_stream_policy)
+        case "pool":
+            return CUDAStreamPoolConfig()
+        case dict():
+            return CUDAStreamPoolConfig(**user_cuda_stream_policy)
+        case str():
+            # assume it's a JSON encoded CUDAStreamPoolConfig
+            try:
+                d = json.loads(user_cuda_stream_policy)
+            except json.JSONDecodeError:
+                raise ValueError(
+                    f"Invalid CUDA stream policy: '{user_cuda_stream_policy}'"
+                ) from None
+            match d:
+                case {"pool_size": int(), "flags": int()}:
+                    return CUDAStreamPoolConfig(
+                        pool_size=d["pool_size"], flags=CudaStreamFlags(d["flags"])
+                    )
+                case {"pool_size": int(), "flags": str()}:
+                    # convert the string names to enums
+                    return CUDAStreamPoolConfig(
+                        pool_size=d["pool_size"],
+                        flags=CudaStreamFlags(CudaStreamFlags.__members__[d["flags"]]),
+                    )
+                case _:
+                    try:
+                        return CUDAStreamPoolConfig(**d)
+                    except TypeError:
+                        raise ValueError(
+                            f"Invalid CUDA stream policy: {user_cuda_stream_policy}"
+                        ) from None
 
 
 @dataclasses.dataclass(frozen=True, eq=True)
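Following the ``match`` arms in ``_convert_cuda_stream_policy``, a sketch of the accepted spellings; the import path is an assumption based on the file-list entry for cudf_polars/utils/config.py, and the values are illustrative:

    from cudf_polars.utils.config import _convert_cuda_stream_policy

    # Plain strings map onto the CUDAStreamPolicy enum.
    _convert_cuda_stream_policy("default")  # CUDAStreamPolicy.DEFAULT
    _convert_cuda_stream_policy("new")      # CUDAStreamPolicy.NEW

    # "pool", a dict, or a JSON-encoded dict produce a CUDAStreamPoolConfig.
    _convert_cuda_stream_policy("pool")
    _convert_cuda_stream_policy({"pool_size": 8})
    _convert_cuda_stream_policy('{"pool_size": 8, "flags": "NON_BLOCKING"}')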
@@ -651,6 +1008,8 @@ class ConfigOptions:
     device
         The GPU used to run the query. If not provided, the
         query uses the current CUDA device.
+    cuda_stream_policy
+        The policy to use for acquiring new CUDA streams. See :class:`~cudf_polars.utils.config.CUDAStreamPolicy` for more.
     """
 
     raise_on_fail: bool = False
@@ -659,6 +1018,14 @@ class ConfigOptions:
         default_factory=StreamingExecutor
     )
     device: int | None = None
+    memory_resource_config: MemoryResourceConfig | None = None
+    cuda_stream_policy: CUDAStreamPolicy | CUDAStreamPoolConfig = dataclasses.field(
+        default_factory=_make_default_factory(
+            "CUDF_POLARS__CUDA_STREAM_POLICY",
+            CUDAStreamPolicy.__call__,
+            default=CUDAStreamPolicy.DEFAULT,
+        )
+    )
 
     @classmethod
     def from_polars_engine(
@@ -672,6 +1039,8 @@ class ConfigOptions:
             "executor_options",
             "parquet_options",
             "raise_on_fail",
+            "memory_resource_config",
+            "cuda_stream_policy",
         }
 
         extra_options = set(engine.config.keys()) - valid_options
@@ -684,8 +1053,20 @@ class ConfigOptions:
         user_executor = os.environ.get(f"{env_prefix}__EXECUTOR", "streaming")
         user_executor_options = engine.config.get("executor_options", {})
         user_parquet_options = engine.config.get("parquet_options", {})
+        if user_parquet_options is None:
+            user_parquet_options = {}
         # This is set in polars, and so can't be overridden by the environment
         user_raise_on_fail = engine.config.get("raise_on_fail", False)
+        user_memory_resource_config = engine.config.get("memory_resource_config", None)
+        if user_memory_resource_config is None and (
+            os.environ.get(f"{MemoryResourceConfig._env_prefix}__QUALNAME", "") != ""
+        ):
+            # We'll pick up the qualname / options from the environment.
+            user_memory_resource_config = MemoryResourceConfig()
+        elif isinstance(user_memory_resource_config, dict):
+            user_memory_resource_config = MemoryResourceConfig(
+                **user_memory_resource_config
+            )
 
         # Backward compatibility for "cardinality_factor"
         # TODO: Remove this in 25.10
@@ -717,7 +1098,7 @@ class ConfigOptions:
             case "streaming":
                 user_executor_options = user_executor_options.copy()
                 # Handle the interaction between the default shuffle method, the
-                # scheduler, and whether rapidsmpf is available.
+                # cluster, and whether rapidsmpf is available.
                 env_shuffle_method = os.environ.get(
                     "CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", None
                 )
@@ -733,9 +1114,43 @@ class ConfigOptions:
             case _:  # pragma: no cover; Unreachable
                 raise ValueError(f"Unsupported executor: {user_executor}")
 
-        return cls(
-            raise_on_fail=user_raise_on_fail,
-            parquet_options=ParquetOptions(**user_parquet_options),
-            executor=executor,
-            device=engine.device,
-        )
+        kwargs = {
+            "raise_on_fail": user_raise_on_fail,
+            "parquet_options": ParquetOptions(**user_parquet_options),
+            "executor": executor,
+            "device": engine.device,
+            "memory_resource_config": user_memory_resource_config,
+        }
+
+        # Handle "cuda-stream-policy".
+        # The default will depend on the runtime and executor.
+        user_cuda_stream_policy = engine.config.get(
+            "cuda_stream_policy", None
+        ) or os.environ.get("CUDF_POLARS__CUDA_STREAM_POLICY", None)
+
+        cuda_stream_policy: CUDAStreamPolicy | CUDAStreamPoolConfig
+
+        if user_cuda_stream_policy is None:
+            if (
+                executor.name == "streaming" and executor.runtime == Runtime.RAPIDSMPF
+            ):  # pragma: no cover; requires rapidsmpf runtime
+                # the rapidsmpf runtime defaults to using a stream pool
+                cuda_stream_policy = CUDAStreamPoolConfig()
+            else:
+                # everything else defaults to the default stream
+                cuda_stream_policy = CUDAStreamPolicy.DEFAULT
+        else:
+            cuda_stream_policy = _convert_cuda_stream_policy(user_cuda_stream_policy)
+
+        # Pool policy is only supported by the rapidsmpf runtime.
+        if isinstance(cuda_stream_policy, CUDAStreamPoolConfig) and (
+            (executor.name != "streaming")
+            or (executor.name == "streaming" and executor.runtime != Runtime.RAPIDSMPF)
+        ):
+            raise ValueError(
+                "CUDAStreamPolicy.POOL is only supported by the rapidsmpf runtime."
+            )
+
+        kwargs["cuda_stream_policy"] = cuda_stream_policy
+
+        return cls(**kwargs)
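Putting the new pieces together, a sketch of a full engine configuration using the config keys validated in ``from_polars_engine`` above (``executor_options``, ``parquet_options``, ``memory_resource_config``, ``cuda_stream_policy``), assuming ``pl.GPUEngine`` forwards these keyword arguments to cudf-polars as it does for ``executor_options`` today; every value is illustrative, and a pool stream policy would only be accepted with the rapidsmpf runtime, per the check above:

    import polars as pl

    engine = pl.GPUEngine(
        raise_on_fail=True,
        executor_options={"cluster": "single"},
        parquet_options={"chunked": True},
        memory_resource_config={
            "qualname": "rmm.mr.CudaAsyncMemoryResource",
            "options": {"initial_pool_size": 100},
        },
        cuda_stream_policy="default",
    )

    # Hypothetical query and file path, for illustration only.
    query = pl.scan_parquet("data.parquet").group_by("key").agg(pl.col("value").sum())
    result = query.collect(engine=engine)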