cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +60 -15
- cudf_polars/containers/column.py +137 -77
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +256 -114
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +33 -3
- cudf_polars/dsl/expressions/unary.py +126 -64
- cudf_polars/dsl/ir.py +1053 -350
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +307 -107
- cudf_polars/dsl/utils/aggregations.py +43 -30
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +55 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +792 -2
- cudf_polars/experimental/benchmarks/utils.py +596 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +30 -15
- cudf_polars/experimental/groupby.py +25 -4
- cudf_polars/experimental/io.py +156 -124
- cudf_polars/experimental/join.py +53 -23
- cudf_polars/experimental/parallel.py +68 -19
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
- cudf_polars/experimental/rapidsmpf/core.py +488 -0
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
- cudf_polars/experimental/rapidsmpf/io.py +696 -0
- cudf_polars/experimental/rapidsmpf/join.py +322 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
- cudf_polars/experimental/rapidsmpf/union.py +115 -0
- cudf_polars/experimental/rapidsmpf/utils.py +374 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +46 -12
- cudf_polars/experimental/sort.py +100 -26
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +93 -17
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +473 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +5 -4
- cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
- cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/utils/config.py
CHANGED
@@ -28,7 +28,10 @@ import importlib.util
 import json
 import os
 import warnings
-from typing import TYPE_CHECKING, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, TypeVar
+
+from rmm.pylibrmm.cuda_stream import CudaStreamFlags
+from rmm.pylibrmm.cuda_stream_pool import CudaStreamPool
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -37,13 +40,18 @@ if TYPE_CHECKING:
 
     import polars.lazyframe.engine_config
 
+    import rmm.mr
+
 
 __all__ = [
+    "Cluster",
     "ConfigOptions",
     "InMemoryExecutor",
     "ParquetOptions",
-    "Scheduler",
+    "Runtime",
+    "Scheduler",  # Deprecated, kept for backward compatibility
     "ShuffleMethod",
+    "ShufflerInsertionMethod",
     "StatsPlanningOptions",
     "StreamingExecutor",
     "StreamingFallbackMode",
@@ -57,8 +65,9 @@ def _env_get_int(name: str, default: int) -> int:
         return default  # pragma: no cover
 
 
-def get_total_device_memory() -> int | None:
-    """Return the total memory of the current device."""
+@functools.cache
+def get_device_handle() -> Any:
+    # Gets called for each IR.do_evaluate node, so we'll cache it.
     import pynvml
 
     try:
@@ -74,12 +83,27 @@ def get_total_device_memory() -> int | None:
             handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(handle)
         else:
             handle = pynvml.nvmlDeviceGetHandleByIndex(int(index))
-
-        return pynvml.nvmlDeviceGetMemoryInfo(handle).total
-
     except pynvml.NVMLError_NotSupported:  # pragma: no cover
         # System doesn't have proper "GPU memory".
         return None
+    else:
+        return handle
+
+
+def get_total_device_memory() -> int | None:
+    """Return the total memory of the current device."""
+    import pynvml
+
+    maybe_handle = get_device_handle()
+
+    if maybe_handle is not None:
+        try:
+            return pynvml.nvmlDeviceGetMemoryInfo(maybe_handle).total
+        except pynvml.NVMLError_NotSupported:  # pragma: no cover
+            # System doesn't have proper "GPU memory".
+            return None
+    else:  # pragma: no cover
+        return None
 
 
 @functools.cache
@@ -120,14 +144,43 @@ class StreamingFallbackMode(str, enum.Enum):
     SILENT = "silent"
 
 
+class Runtime(str, enum.Enum):
+    """
+    The runtime to use for the streaming executor.
+
+    * ``Runtime.TASKS`` : Use the task-based runtime.
+      This is the default runtime.
+    * ``Runtime.RAPIDSMPF`` : Use the coroutine-based streaming runtime (rapidsmpf).
+      This runtime is experimental.
+    """
+
+    TASKS = "tasks"
+    RAPIDSMPF = "rapidsmpf"
+
+
+class Cluster(str, enum.Enum):
+    """
+    The cluster configuration for the streaming executor.
+
+    * ``Cluster.SINGLE`` : Single-GPU execution. Currently uses a zero-dependency,
+      synchronous, single-threaded task scheduler.
+    * ``Cluster.DISTRIBUTED`` : Multi-GPU distributed execution. Currently
+      uses a Dask-based distributed scheduler and requires an
+      active Dask cluster.
+    """
+
+    SINGLE = "single"
+    DISTRIBUTED = "distributed"
+
+
 class Scheduler(str, enum.Enum):
     """
-    The scheduler to use for the streaming executor.
+    **Deprecated**: Use :class:`Cluster` instead.
+
+    The scheduler to use for the task-based streaming executor.
 
-    * ``Scheduler.SYNCHRONOUS`` : A zero-dependency, synchronous, single-threaded
-      scheduler.
-    * ``Scheduler.DISTRIBUTED`` : A Dask-based distributed scheduler.
-      Using this scheduler requires an active Dask cluster.
+    * ``Scheduler.SYNCHRONOUS`` : Single-GPU execution (use ``Cluster.SINGLE`` instead)
+    * ``Scheduler.DISTRIBUTED`` : Multi-GPU execution (use ``Cluster.DISTRIBUTED`` instead)
     """
 
     SYNCHRONOUS = "synchronous"
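For orientation, these enums surface through the ``executor_options`` dictionary that this module reads from polars' ``GPUEngine`` config. The snippet below is an illustrative sketch, not an excerpt from the package; the option names are taken from the fields added later in this diff.

import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())

# "cluster" replaces the deprecated scheduler="synchronous"/"distributed" values;
# "runtime" picks the default task-based runtime or the experimental rapidsmpf one.
engine = pl.GPUEngine(
    executor="streaming",
    executor_options={"cluster": "single", "runtime": "tasks"},
)
result = q.collect(engine=engine)

Passing ``scheduler="synchronous"`` still works but is mapped to ``cluster="single"`` with a ``FutureWarning`` by the ``__post_init__`` logic shown further down.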
@@ -143,12 +196,12 @@ class ShuffleMethod(str, enum.Enum):
     * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler.
 
     With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None``
-    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for the distributed scheduler,
+    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for a distributed cluster,
     but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed.
 
     The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly.
     A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process
-    shuffler automatically when using the synchronous scheduler.
+    shuffler automatically when using single-GPU execution.
     """
 
     TASKS = "tasks"
@@ -156,6 +209,20 @@ class ShuffleMethod(str, enum.Enum):
     _RAPIDSMPF_SINGLE = "rapidsmpf-single"
 
 
+class ShufflerInsertionMethod(str, enum.Enum):
+    """
+    The method to use for inserting chunks into the rapidsmpf shuffler.
+
+    * ``ShufflerInsertionMethod.INSERT_CHUNKS`` : Use insert_chunks for inserting data.
+    * ``ShufflerInsertionMethod.CONCAT_INSERT`` : Use concat_insert for inserting data.
+
+    Only applicable with the "rapidsmpf" shuffle method and the "tasks" runtime.
+    """
+
+    INSERT_CHUNKS = "insert_chunks"
+    CONCAT_INSERT = "concat_insert"
+
+
 T = TypeVar("T")
@@ -215,6 +282,10 @@ class ParquetOptions:
 
         Set to 0 to avoid row-group sampling. Note that row-group sampling
         will also be skipped if ``max_footer_samples`` is 0.
+    use_rapidsmpf_native
+        Whether to use the native rapidsmpf node for parquet reading.
+        This option is only used when the rapidsmpf runtime is enabled.
+        Default is True.
     """
 
     _env_prefix = "CUDF_POLARS__PARQUET_OPTIONS"
@@ -249,6 +320,13 @@ class ParquetOptions:
             f"{_env_prefix}__MAX_ROW_GROUP_SAMPLES", int, default=1
         )
     )
+    use_rapidsmpf_native: bool = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__USE_RAPIDSMPF_NATIVE",
+            _bool_converter,
+            default=True,
+        )
+    )
 
     def __post_init__(self) -> None:  # noqa: D105
         if not isinstance(self.chunked, bool):
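Like the other ``ParquetOptions`` fields, the new flag can also be driven from the environment via the ``_env_prefix`` naming scheme. A hypothetical illustration; the exact strings accepted by ``_bool_converter`` are an assumption here.

import os

# Must be set before the options are constructed; _make_default_factory only
# consults the environment when no explicit value is supplied.
os.environ["CUDF_POLARS__PARQUET_OPTIONS__USE_RAPIDSMPF_NATIVE"] = "0"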
@@ -263,9 +341,11 @@ class ParquetOptions:
             raise TypeError("max_footer_samples must be an int")
         if not isinstance(self.max_row_group_samples, int):
             raise TypeError("max_row_group_samples must be an int")
+        if not isinstance(self.use_rapidsmpf_native, bool):
+            raise TypeError("use_rapidsmpf_native must be a bool")
 
 
-def default_blocksize(scheduler: str) -> int:
+def default_blocksize(cluster: str) -> int:
     """Return the default blocksize."""
     device_size = get_total_device_memory()
     if device_size is None:  # pragma: no cover
@@ -274,7 +354,7 @@ def default_blocksize(scheduler: str) -> int:
         return 1_000_000_000
 
     if (
-        scheduler == "distributed"
+        cluster == "distributed"
         or _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 0
     ):
         # Distributed execution requires a conservative
@@ -370,6 +450,102 @@ class StatsPlanningOptions:
             raise TypeError("default_selectivity must be a float")
 
 
+@dataclasses.dataclass(frozen=True, eq=True)
+class MemoryResourceConfig:
+    """
+    Configuration for the default memory resource.
+
+    Parameters
+    ----------
+    qualname
+        The fully qualified name of the memory resource class to use.
+    options
+        This can be either a dictionary representing the options to pass
+        to the memory resource class, or, a dictionary representing a
+        nested memory resource configuration. The presence of "qualname"
+        field indicates a nested memory resource configuration.
+
+    Examples
+    --------
+    Create a memory resource config for a single memory resource:
+    >>> MemoryResourceConfig(
+    ...     qualname="rmm.mr.CudaAsyncMemoryResource",
+    ...     options={"initial_pool_size": 100},
+    ... )
+
+    Create a memory resource config for a nested memory resource configuration:
+    >>> MemoryResourceConfig(
+    ...     qualname="rmm.mr.PrefetchResourceAdaptor",
+    ...     options={
+    ...         "upstream_mr": {
+    ...             "qualname": "rmm.mr.PoolMemoryResource",
+    ...             "options": {
+    ...                 "upstream_mr": {
+    ...                     "qualname": "rmm.mr.ManagedMemoryResource",
+    ...                 },
+    ...                 "initial_pool_size": 256,
+    ...             },
+    ...         }
+    ...     },
+    ... )
+    """
+
+    _env_prefix = "CUDF_POLARS__MEMORY_RESOURCE_CONFIG"
+    qualname: str = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__QUALNAME",
+            str,
+            # We shouldn't reach here if qualname isn't set in the environment.
+            default=None,  # type: ignore[assignment]
+        )
+    )
+    options: dict[str, Any] | None = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__OPTIONS",
+            json.loads,
+            default=None,
+        )
+    )
+
+    def __post_init__(self) -> None:
+        if self.qualname.count(".") < 1:
+            raise ValueError(
+                f"MemoryResourceConfig.qualname '{self.qualname}' must be a fully qualified name to a class, including the module name."
+            )
+
+    def create_memory_resource(self) -> rmm.mr.DeviceMemoryResource:
+        """Create a memory resource from the configuration."""
+
+        def create_mr(
+            qualname: str, options: dict[str, Any] | None
+        ) -> rmm.mr.DeviceMemoryResource:
+            module_name, class_name = qualname.rsplit(".", 1)
+            module = importlib.import_module(module_name)
+            cls = getattr(module, class_name)
+            return cls(**options or {})
+
+        def process_options(opts: dict[str, Any] | None) -> dict[str, Any]:
+            if opts is None:
+                return {}
+
+            processed = {}
+            for key, value in opts.items():
+                if isinstance(value, dict) and "qualname" in value:
+                    # This is a nested memory resource config
+                    nested_qualname = value["qualname"]
+                    nested_options = process_options(value.get("options"))
+                    processed[key] = create_mr(nested_qualname, nested_options)
+                else:
+                    processed[key] = value
+            return processed
+
+        # Create the top-level memory resource
+        return create_mr(self.qualname, process_options(self.options))
+
+    def __hash__(self) -> int:
+        return hash((self.qualname, json.dumps(self.options, sort_keys=True)))
+
+
 @dataclasses.dataclass(frozen=True, eq=True)
 class StreamingExecutor:
     """
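Beyond the doctest above, the rest of this diff wires ``MemoryResourceConfig`` into ``ConfigOptions.from_polars_engine``, so it can be supplied either as a dict in the engine config or through environment variables. A hedged sketch of both paths follows; ``rmm.mr.ManagedMemoryResource``/``PoolMemoryResource`` are just example classes, and ``GPUEngine`` forwarding unknown keyword arguments into its config dict is how these keys reach cudf-polars.

import os
import polars as pl

# 1) As part of the engine configuration; from_polars_engine (later in this diff)
#    converts the dict with MemoryResourceConfig(**...).
engine = pl.GPUEngine(
    executor="streaming",
    memory_resource_config={"qualname": "rmm.mr.ManagedMemoryResource"},
)

# 2) Via the environment; __OPTIONS, when set, is parsed with json.loads and may
#    nest further {"qualname": ..., "options": ...} configurations.
os.environ["CUDF_POLARS__MEMORY_RESOURCE_CONFIG__QUALNAME"] = "rmm.mr.PoolMemoryResource"
os.environ["CUDF_POLARS__MEMORY_RESOURCE_CONFIG__OPTIONS"] = (
    '{"upstream_mr": {"qualname": "rmm.mr.ManagedMemoryResource"}, "initial_pool_size": 256}'
)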
@@ -380,11 +556,25 @@ class StreamingExecutor:
 
     Parameters
     ----------
+    runtime
+        The runtime to use for the streaming executor.
+        ``Runtime.TASKS`` by default.
+    cluster
+        The cluster configuration for the streaming executor.
+        ``Cluster.SINGLE`` by default.
+
+        This setting applies to both task-based and rapidsmpf execution modes:
+
+        * ``Cluster.SINGLE``: Single-GPU execution
+        * ``Cluster.DISTRIBUTED``: Multi-GPU distributed execution (requires
+          an active Dask cluster)
+
     scheduler
-        The scheduler to use for the streaming executor. ``Scheduler.SYNCHRONOUS``
-        by default.
+        **Deprecated**: Use ``cluster`` instead.
 
-        Using ``Scheduler.DISTRIBUTED`` requires an active Dask cluster.
+        For backward compatibility:
+        * ``Scheduler.SYNCHRONOUS`` maps to ``Cluster.SINGLE``
+        * ``Scheduler.DISTRIBUTED`` maps to ``Cluster.DISTRIBUTED``
     fallback_mode
         How to handle errors when the GPU engine fails to execute a query.
         ``StreamingFallbackMode.WARN`` by default.
@@ -414,10 +604,10 @@ class StreamingExecutor:
         - the ``CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE`` environment variable
 
         By default, cudf-polars uses a target partition size that's a fraction
-        of the device memory, where the fraction depends on the scheduler:
+        of the device memory, where the fraction depends on the cluster:
 
         - distributed: 1/40th of the device memory
-        - synchronous: 1/16th of the device memory
+        - single: 1/16th of the device memory
 
         The optional pynvml dependency is used to query the device memory size. If
         pynvml is not available, a warning is emitted and the device size is assumed
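To make the fractions concrete, here is a rough sketch of the sizing rule described above (fractions from the docstring; the real ``default_blocksize`` may round or clamp differently).

def approx_target_partition_size(total_device_memory: int, cluster: str) -> int:
    # 1/40th of device memory for a distributed cluster, 1/16th for single-GPU.
    fraction = 40 if cluster == "distributed" else 16
    return total_device_memory // fraction

approx_target_partition_size(80_000_000_000, "distributed")  # 2_000_000_000 bytes
approx_target_partition_size(80_000_000_000, "single")       # 5_000_000_000 bytes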
@@ -434,36 +624,62 @@ class StreamingExecutor:
         a broadcast join.
     shuffle_method
         The method to use for shuffling data between workers. Defaults to
-        'rapidsmpf' for distributed scheduler if available (otherwise 'tasks'),
-        and 'tasks' for synchronous scheduler.
+        'rapidsmpf' for distributed cluster if available (otherwise 'tasks'),
+        and 'tasks' for single-GPU cluster.
+    shuffler_insertion_method
+        The method to use for inserting chunks with the rapidsmpf shuffler.
+        Can be 'insert_chunks' (default) or 'concat_insert'.
+
+        Only applicable with ``shuffle_method="rapidsmpf"`` and ``runtime="tasks"``.
     rapidsmpf_spill
         Whether to wrap task arguments and output in objects that are
         spillable by 'rapidsmpf'.
+    client_device_threshold
+        Threshold for spilling data from device memory in rapidsmpf.
+        Default is 50% of device memory on the client process.
+        This argument is only used by the "rapidsmpf" runtime.
     sink_to_directory
         Whether multi-partition sink operations should write to a directory
         rather than a single file. By default, this will be set to True for
-        the 'distributed' scheduler and False otherwise. The 'distributed'
-        scheduler does not currently support ``sink_to_directory=False``.
+        the 'distributed' cluster and False otherwise. The 'distributed'
+        cluster does not currently support ``sink_to_directory=False``.
     stats_planning
         Options controlling statistics-based query planning. See
         :class:`~cudf_polars.utils.config.StatsPlanningOptions` for more.
+    max_io_threads
+        Maximum number of IO threads for the rapidsmpf runtime. Default is 2.
+        This controls the parallelism of IO operations when reading data.
 
     Notes
     -----
     The streaming executor does not currently support profiling a query via
     the ``.profile()`` method. We recommend using nsys to profile queries
-    with the synchronous scheduler and Dask's built-in profiling tools
-    with the distributed scheduler.
+    with single-GPU execution and Dask's built-in profiling tools
+    with distributed execution.
     """
 
     _env_prefix = "CUDF_POLARS__EXECUTOR"
 
     name: Literal["streaming"] = dataclasses.field(default="streaming", init=False)
-    scheduler: Scheduler = dataclasses.field(
+    runtime: Runtime = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__RUNTIME",
+            Runtime.__call__,
+            default=Runtime.TASKS,
+        )
+    )
+    cluster: Cluster | None = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__CLUSTER",
+            Cluster.__call__,
+            default=None,
+        )
+    )
+    scheduler: Scheduler | None = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__SCHEDULER",
             Scheduler.__call__,
-            default=Scheduler.SYNCHRONOUS,
+            default=None,
         )
     )
     fallback_mode: StreamingFallbackMode = dataclasses.field(
@@ -505,11 +721,23 @@ class StreamingExecutor:
             default=ShuffleMethod.TASKS,
         )
     )
+    shuffler_insertion_method: ShufflerInsertionMethod = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__SHUFFLER_INSERTION_METHOD",
+            ShufflerInsertionMethod.__call__,
+            default=ShufflerInsertionMethod.INSERT_CHUNKS,
+        )
+    )
     rapidsmpf_spill: bool = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__RAPIDSMPF_SPILL", _bool_converter, default=False
         )
     )
+    client_device_threshold: float = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__CLIENT_DEVICE_THRESHOLD", float, default=0.5
+        )
+    )
     sink_to_directory: bool | None = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__SINK_TO_DIRECTORY", _bool_converter, default=None
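Taken together, the new shuffle-related fields might be set like this for a multi-GPU run. A sketch only, using the option names defined above; an active Dask cluster and an installed rapidsmpf are assumed for these values to be accepted at collect time.

import polars as pl

engine = pl.GPUEngine(
    executor="streaming",
    executor_options={
        "cluster": "distributed",
        "shuffle_method": "rapidsmpf",
        "shuffler_insertion_method": "concat_insert",  # default is "insert_chunks"
        "rapidsmpf_spill": True,
    },
)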
@@ -518,12 +746,64 @@ class StreamingExecutor:
     stats_planning: StatsPlanningOptions = dataclasses.field(
         default_factory=StatsPlanningOptions
     )
+    max_io_threads: int = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__MAX_IO_THREADS", int, default=2
+        )
+    )
 
     def __post_init__(self) -> None:  # noqa: D105
+        # Check for rapidsmpf runtime
+        if self.runtime == "rapidsmpf":  # pragma: no cover; requires rapidsmpf runtime
+            if not rapidsmpf_single_available():
+                raise ValueError("The rapidsmpf streaming engine requires rapidsmpf.")
+            if self.shuffle_method == "tasks":
+                raise ValueError(
+                    "The rapidsmpf streaming engine does not support task-based shuffling."
+                )
+            object.__setattr__(self, "shuffle_method", "rapidsmpf")
+
+        # Handle backward compatibility for deprecated scheduler parameter
+        if self.scheduler is not None:
+            if self.cluster is not None:
+                raise ValueError(
+                    "Cannot specify both 'scheduler' and 'cluster'. "
+                    "The 'scheduler' parameter is deprecated. "
+                    "Please use only 'cluster' instead."
+                )
+            else:
+                warnings.warn(
+                    """The 'scheduler' parameter is deprecated. Please use 'cluster' instead.
+                    Use 'cluster="single"' instead of 'scheduler="synchronous"' and "
+                    'cluster="distributed"' instead of 'scheduler="distributed"'.""",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+            # Map old scheduler values to new cluster values
+            if self.scheduler == "synchronous":
+                object.__setattr__(self, "cluster", Cluster.SINGLE)
+            elif self.scheduler == "distributed":
+                object.__setattr__(self, "cluster", Cluster.DISTRIBUTED)
+            # Clear scheduler to avoid confusion
+            object.__setattr__(self, "scheduler", None)
+        elif self.cluster is None:
+            object.__setattr__(self, "cluster", Cluster.SINGLE)
+        assert self.cluster is not None, "Expected cluster to be set."
+
+        # Warn loudly that multi-GPU execution is under construction
+        # for the rapidsmpf runtime
+        if self.cluster == "distributed" and self.runtime == "rapidsmpf":
+            warnings.warn(
+                "UNDER CONSTRUCTION!!!"
+                "The rapidsmpf runtime does NOT support distributed execution yet. "
+                "Use at your own risk!!!",
+                stacklevel=2,
+            )
+
         # Handle shuffle_method defaults for streaming executor
         if self.shuffle_method is None:
-            if self.scheduler == "distributed" and rapidsmpf_distributed_available():
-                # For distributed scheduler, prefer rapidsmpf if available
+            if self.cluster == "distributed" and rapidsmpf_distributed_available():
+                # For distributed cluster, prefer rapidsmpf if available
                 object.__setattr__(self, "shuffle_method", "rapidsmpf")
             else:
                 # Otherwise, use task-based shuffle for now.
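The scheduler-to-cluster mapping implemented above can be observed directly. A small sketch, assuming cudf-polars 26.2 is importable (on a machine without a GPU the partition-size fallback may emit its own warning as well):

import warnings
from cudf_polars.utils.config import StreamingExecutor

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    executor = StreamingExecutor(scheduler="synchronous")

# The deprecated value is mapped onto the new field and then cleared.
assert executor.cluster.value == "single"
assert executor.scheduler is None
assert any(issubclass(w.category, FutureWarning) for w in caught)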
@@ -534,19 +814,16 @@ class StreamingExecutor:
                 raise ValueError("rapidsmpf-single is not a supported shuffle method.")
         elif self.shuffle_method == "rapidsmpf":
             # Check that we have rapidsmpf installed
-            if (
-                self.scheduler == "distributed"
-                and not rapidsmpf_distributed_available()
-            ):
+            if self.cluster == "distributed" and not rapidsmpf_distributed_available():
                 raise ValueError(
                     "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed."
                 )
-            elif self.scheduler == "synchronous" and not rapidsmpf_single_available():
+            elif self.cluster == "single" and not rapidsmpf_single_available():
                 raise ValueError(
                     "rapidsmpf shuffle method requested, but rapidsmpf is not installed."
                 )
-            # Select "rapidsmpf-single" for the synchronous scheduler
-            if self.scheduler == "synchronous":
+            # Select "rapidsmpf-single" for single-GPU
+            if self.cluster == "single":
                 object.__setattr__(self, "shuffle_method", "rapidsmpf-single")
 
         # frozen dataclass, so use object.__setattr__
@@ -555,17 +832,24 @@ class StreamingExecutor:
             )
         if self.target_partition_size == 0:
             object.__setattr__(
-                self, "target_partition_size", default_blocksize(self.scheduler)
+                self,
+                "target_partition_size",
+                default_blocksize(self.cluster),
             )
         if self.broadcast_join_limit == 0:
             object.__setattr__(
                 self,
                 "broadcast_join_limit",
-                # Usually better to avoid shuffling for single gpu
-                2 if self.scheduler == "distributed" else 32,
+                # Usually better to avoid shuffling for single gpu with UVM
+                2 if self.cluster == "distributed" else 32,
             )
-        object.__setattr__(self, "scheduler", Scheduler(self.scheduler))
+        object.__setattr__(self, "cluster", Cluster(self.cluster))
         object.__setattr__(self, "shuffle_method", ShuffleMethod(self.shuffle_method))
+        object.__setattr__(
+            self,
+            "shuffler_insertion_method",
+            ShufflerInsertionMethod(self.shuffler_insertion_method),
+        )
 
         # Make sure stats_planning is a dataclass
         if isinstance(self.stats_planning, dict):
@@ -575,10 +859,10 @@ class StreamingExecutor:
                 StatsPlanningOptions(**self.stats_planning),
             )
 
-        if self.scheduler == "distributed":
+        if self.cluster == "distributed":
             if self.sink_to_directory is False:
                 raise ValueError(
-                    "The distributed scheduler requires sink_to_directory=True"
+                    "The distributed cluster requires sink_to_directory=True"
                 )
             object.__setattr__(self, "sink_to_directory", True)
         elif self.sink_to_directory is None:
@@ -599,13 +883,17 @@ class StreamingExecutor:
             raise TypeError("rapidsmpf_spill must be bool")
         if not isinstance(self.sink_to_directory, bool):
             raise TypeError("sink_to_directory must be bool")
+        if not isinstance(self.client_device_threshold, float):
+            raise TypeError("client_device_threshold must be a float")
+        if not isinstance(self.max_io_threads, int):
+            raise TypeError("max_io_threads must be an int")
 
-        # RapidsMPF spill is only supported for the distributed scheduler for now.
+        # RapidsMPF spill is only supported for distributed clusters for now.
         # This is because the spilling API is still within the RMPF-Dask integration.
         # (See https://github.com/rapidsai/rapidsmpf/issues/439)
-        if self.scheduler == "synchronous" and self.rapidsmpf_spill:
+        if self.cluster == "single" and self.rapidsmpf_spill:  # pragma: no cover
             raise ValueError(
-                "rapidsmpf_spill is not supported for the synchronous scheduler."
+                "rapidsmpf_spill is not supported for single-GPU execution."
             )
 
     def __hash__(self) -> int:  # noqa: D105
@@ -622,14 +910,83 @@ class InMemoryExecutor:
     """
     Configuration for the cudf-polars in-memory executor.
 
+    The in-memory executor only supports single-GPU execution.
+    """
+
+    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+
+
+@dataclasses.dataclass(frozen=True, eq=True)
+class CUDAStreamPoolConfig:
+    """
+    Configuration for the CUDA stream pool.
+
     Parameters
     ----------
-    scheduler
-        The scheduler to use for the in-memory executor. Currently only
-        ``Scheduler.SYNCHRONOUS`` is supported.
+    pool_size
+        The size of the CUDA stream pool.
+    flags
+        The flags to use for the CUDA stream pool.
     """
 
-    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+    pool_size: int = 16
+    flags: CudaStreamFlags = CudaStreamFlags.NON_BLOCKING
+
+    def build(self) -> CudaStreamPool:
+        return CudaStreamPool(
+            pool_size=self.pool_size,
+            flags=self.flags,
+        )
+
+
+class CUDAStreamPolicy(str, enum.Enum):
+    """
+    The policy to use for acquiring new CUDA streams.
+
+    * ``CUDAStreamPolicy.DEFAULT`` : Use the default CUDA stream.
+    * ``CUDAStreamPolicy.NEW`` : Create a new CUDA stream.
+    """
+
+    DEFAULT = "default"
+    NEW = "new"
+
+
+def _convert_cuda_stream_policy(
+    user_cuda_stream_policy: dict | str,
+) -> CUDAStreamPolicy | CUDAStreamPoolConfig:
+    match user_cuda_stream_policy:
+        case "default" | "new":
+            return CUDAStreamPolicy(user_cuda_stream_policy)
+        case "pool":
+            return CUDAStreamPoolConfig()
+        case dict():
+            return CUDAStreamPoolConfig(**user_cuda_stream_policy)
+        case str():
+            # assume it's a JSON encoded CUDAStreamPoolConfig
+            try:
+                d = json.loads(user_cuda_stream_policy)
+            except json.JSONDecodeError:
+                raise ValueError(
+                    f"Invalid CUDA stream policy: '{user_cuda_stream_policy}'"
+                ) from None
+            match d:
+                case {"pool_size": int(), "flags": int()}:
+                    return CUDAStreamPoolConfig(
+                        pool_size=d["pool_size"], flags=CudaStreamFlags(d["flags"])
+                    )
+                case {"pool_size": int(), "flags": str()}:
+                    # convert the string names to enums
+                    return CUDAStreamPoolConfig(
+                        pool_size=d["pool_size"],
+                        flags=CudaStreamFlags(CudaStreamFlags.__members__[d["flags"]]),
+                    )
+                case _:
+                    try:
+                        return CUDAStreamPoolConfig(**d)
+                    except TypeError:
+                        raise ValueError(
+                            f"Invalid CUDA stream policy: {user_cuda_stream_policy}"
+                        ) from None
 
 
 @dataclasses.dataclass(frozen=True, eq=True)
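The conversion helper above accepts several spellings for the stream policy. An illustrative sketch of how they might be passed; the ``cuda_stream_policy`` key and the environment variable name come from ``valid_options`` and ``from_polars_engine`` below, while rapidsmpf availability (required for pool policies at collect time) is an assumption.

import os
import polars as pl

# Plain string policies work with any runtime.
pl.GPUEngine(executor="streaming", cuda_stream_policy="default")
pl.GPUEngine(executor="streaming", cuda_stream_policy="new")

# A stream pool is only accepted together with the rapidsmpf runtime.
pl.GPUEngine(
    executor="streaming",
    executor_options={"runtime": "rapidsmpf"},
    cuda_stream_policy={"pool_size": 32},
)

# Or as a JSON-encoded pool config in the environment; string flag names are
# translated through CudaStreamFlags.__members__.
os.environ["CUDF_POLARS__CUDA_STREAM_POLICY"] = '{"pool_size": 32, "flags": "NON_BLOCKING"}'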
@@ -651,6 +1008,8 @@ class ConfigOptions:
     device
         The GPU used to run the query. If not provided, the
         query uses the current CUDA device.
+    cuda_stream_policy
+        The policy to use for acquiring new CUDA streams. See :class:`~cudf_polars.utils.config.CUDAStreamPolicy` for more.
     """
 
     raise_on_fail: bool = False
@@ -659,6 +1018,14 @@ class ConfigOptions:
         default_factory=StreamingExecutor
     )
     device: int | None = None
+    memory_resource_config: MemoryResourceConfig | None = None
+    cuda_stream_policy: CUDAStreamPolicy | CUDAStreamPoolConfig = dataclasses.field(
+        default_factory=_make_default_factory(
+            "CUDF_POLARS__CUDA_STREAM_POLICY",
+            CUDAStreamPolicy.__call__,
+            default=CUDAStreamPolicy.DEFAULT,
+        )
+    )
 
     @classmethod
     def from_polars_engine(
@@ -672,6 +1039,8 @@ class ConfigOptions:
             "executor_options",
             "parquet_options",
             "raise_on_fail",
+            "memory_resource_config",
+            "cuda_stream_policy",
         }
 
         extra_options = set(engine.config.keys()) - valid_options
@@ -684,8 +1053,20 @@ class ConfigOptions:
         user_executor = os.environ.get(f"{env_prefix}__EXECUTOR", "streaming")
         user_executor_options = engine.config.get("executor_options", {})
         user_parquet_options = engine.config.get("parquet_options", {})
+        if user_parquet_options is None:
+            user_parquet_options = {}
         # This is set in polars, and so can't be overridden by the environment
         user_raise_on_fail = engine.config.get("raise_on_fail", False)
+        user_memory_resource_config = engine.config.get("memory_resource_config", None)
+        if user_memory_resource_config is None and (
+            os.environ.get(f"{MemoryResourceConfig._env_prefix}__QUALNAME", "") != ""
+        ):
+            # We'll pick up the qualname / options from the environment.
+            user_memory_resource_config = MemoryResourceConfig()
+        elif isinstance(user_memory_resource_config, dict):
+            user_memory_resource_config = MemoryResourceConfig(
+                **user_memory_resource_config
+            )
 
         # Backward compatibility for "cardinality_factor"
         # TODO: Remove this in 25.10
@@ -717,7 +1098,7 @@ class ConfigOptions:
             case "streaming":
                 user_executor_options = user_executor_options.copy()
                 # Handle the interaction between the default shuffle method, the
-                # scheduler, and whether rapidsmpf is available.
+                # cluster, and whether rapidsmpf is available.
                 env_shuffle_method = os.environ.get(
                     "CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", None
                 )
@@ -733,9 +1114,43 @@ class ConfigOptions:
             case _:  # pragma: no cover; Unreachable
                 raise ValueError(f"Unsupported executor: {user_executor}")
 
-        return cls(
-            raise_on_fail=user_raise_on_fail,
-            parquet_options=ParquetOptions(**user_parquet_options),
-            executor=executor,
-            device=engine.device,
-        )
+        kwargs = {
+            "raise_on_fail": user_raise_on_fail,
+            "parquet_options": ParquetOptions(**user_parquet_options),
+            "executor": executor,
+            "device": engine.device,
+            "memory_resource_config": user_memory_resource_config,
+        }
+
+        # Handle "cuda-stream-policy".
+        # The default will depend on the runtime and executor.
+        user_cuda_stream_policy = engine.config.get(
+            "cuda_stream_policy", None
+        ) or os.environ.get("CUDF_POLARS__CUDA_STREAM_POLICY", None)
+
+        cuda_stream_policy: CUDAStreamPolicy | CUDAStreamPoolConfig
+
+        if user_cuda_stream_policy is None:
+            if (
+                executor.name == "streaming" and executor.runtime == Runtime.RAPIDSMPF
+            ):  # pragma: no cover; requires rapidsmpf runtime
+                # the rapidsmpf runtime defaults to using a stream pool
+                cuda_stream_policy = CUDAStreamPoolConfig()
+            else:
+                # everything else defaults to the default stream
+                cuda_stream_policy = CUDAStreamPolicy.DEFAULT
+        else:
+            cuda_stream_policy = _convert_cuda_stream_policy(user_cuda_stream_policy)
+
+        # Pool policy is only supported by the rapidsmpf runtime.
+        if isinstance(cuda_stream_policy, CUDAStreamPoolConfig) and (
+            (executor.name != "streaming")
+            or (executor.name == "streaming" and executor.runtime != Runtime.RAPIDSMPF)
+        ):
+            raise ValueError(
+                "CUDAStreamPolicy.POOL is only supported by the rapidsmpf runtime."
+            )
+
+        kwargs["cuda_stream_policy"] = cuda_stream_policy
+
+        return cls(**kwargs)