cudf-polars-cu13 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/utils/config.py (new file)
@@ -0,0 +1,741 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""
Configuration utilities for the cudf-polars engine.

Most users will not construct these objects directly. Instead, you'll pass
keyword arguments to :class:`~polars.lazyframe.engine_config.GPUEngine`. The
majority of the options are passed as `**kwargs` and collected into the
configuration described below:

.. code-block:: python

    >>> import polars as pl
    >>> engine = pl.GPUEngine(
    ...     executor="streaming",
    ...     executor_options={"fallback_mode": "raise"}
    ... )

"""

from __future__ import annotations

import dataclasses
import enum
import functools
import importlib.util
import json
import os
import warnings
from typing import TYPE_CHECKING, Literal, TypeVar

if TYPE_CHECKING:
    from collections.abc import Callable

    from typing_extensions import Self

    import polars.lazyframe.engine_config


__all__ = [
    "ConfigOptions",
    "InMemoryExecutor",
    "ParquetOptions",
    "Scheduler",
    "ShuffleMethod",
    "StatsPlanningOptions",
    "StreamingExecutor",
    "StreamingFallbackMode",
]


def _env_get_int(name: str, default: int) -> int:
    try:
        return int(os.getenv(name, default))
    except (ValueError, TypeError):  # pragma: no cover
        return default  # pragma: no cover


def get_total_device_memory() -> int | None:
    """Return the total memory of the current device."""
    import pynvml

    try:
        pynvml.nvmlInit()
        index = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
        if index and not index.isnumeric():  # pragma: no cover
            # This means device_index is UUID.
            # This works for both MIG and non-MIG device UUIDs.
            handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(index))
            if pynvml.nvmlDeviceIsMigDeviceHandle(handle):
                # Additionally get parent device handle
                # if the device itself is a MIG instance
                handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(handle)
        else:
            handle = pynvml.nvmlDeviceGetHandleByIndex(int(index))

        return pynvml.nvmlDeviceGetMemoryInfo(handle).total

    except pynvml.NVMLError_NotSupported:  # pragma: no cover
        # System doesn't have proper "GPU memory".
        return None


@functools.cache
def rapidsmpf_single_available() -> bool:  # pragma: no cover
    """Query whether rapidsmpf is available as a single-process shuffle method."""
    try:
        return importlib.util.find_spec("rapidsmpf.integrations.single") is not None
    except (ImportError, ValueError):
        return False


@functools.cache
def rapidsmpf_distributed_available() -> bool:  # pragma: no cover
    """Query whether rapidsmpf is available as a distributed shuffle method."""
    try:
        return importlib.util.find_spec("rapidsmpf.integrations.dask") is not None
    except (ImportError, ValueError):
        return False


# TODO: Use enum.StrEnum when we drop Python 3.10


class StreamingFallbackMode(str, enum.Enum):
    """
    How the streaming executor handles operations that don't support multiple partitions.

    Upon encountering an unsupported operation, the streaming executor will fall
    back to using a single partition, which might use a large amount of memory.

    * ``StreamingFallbackMode.WARN`` : Emit a warning and fall back to a single partition.
    * ``StreamingFallbackMode.SILENT``: Silently fall back to a single partition.
    * ``StreamingFallbackMode.RAISE`` : Raise an exception.
    """

    WARN = "warn"
    RAISE = "raise"
    SILENT = "silent"


class Scheduler(str, enum.Enum):
    """
    The scheduler to use for the streaming executor.

    * ``Scheduler.SYNCHRONOUS`` : A zero-dependency, synchronous,
      single-threaded scheduler.
    * ``Scheduler.DISTRIBUTED`` : A Dask-based distributed scheduler.
      Using this scheduler requires an active Dask cluster.
    """

    SYNCHRONOUS = "synchronous"
    DISTRIBUTED = "distributed"


class ShuffleMethod(str, enum.Enum):
    """
    The method to use for shuffling data between workers with the streaming executor.

    * ``ShuffleMethod.TASKS`` : Use the task-based shuffler.
    * ``ShuffleMethod.RAPIDSMPF`` : Use the rapidsmpf shuffler.
    * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler.

    With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None``
    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for the distributed scheduler,
    but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed.

    The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly.
    A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process
    shuffler automatically when the 'synchronous' scheduler is active.
    """

    TASKS = "tasks"
    RAPIDSMPF = "rapidsmpf"
    _RAPIDSMPF_SINGLE = "rapidsmpf-single"
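The three enums above subclass `str`, so the plain strings accepted in `executor_options` (and in the environment variables described later in this file) map directly onto enum members. A minimal, illustrative sketch of that equivalence:

```python
from cudf_polars.utils.config import Scheduler, ShuffleMethod, StreamingFallbackMode

# Lookup by value returns the singleton member ...
assert StreamingFallbackMode("raise") is StreamingFallbackMode.RAISE
assert Scheduler("distributed") is Scheduler.DISTRIBUTED
# ... and, because these are str-based enums, members compare equal to their strings.
assert ShuffleMethod.TASKS == "tasks"
```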


T = TypeVar("T")


def _make_default_factory(
    key: str, converter: Callable[[str], T], *, default: T
) -> Callable[[], T]:
    def default_factory() -> T:
        v = os.environ.get(key)
        if v is None:
            return default
        return converter(v)

    return default_factory


def _bool_converter(v: str) -> bool:
    lowered = v.lower()
    if lowered in {"1", "true", "yes", "y"}:
        return True
    elif lowered in {"0", "false", "no", "n"}:
        return False
    else:
        raise ValueError(f"Invalid boolean value: '{v}'")


@dataclasses.dataclass(frozen=True)
class ParquetOptions:
    """
    Configuration for the cudf-polars Parquet engine.

    These options can be configured via environment variables
    with the prefix ``CUDF_POLARS__PARQUET_OPTIONS__``.

    Parameters
    ----------
    chunked
        Whether to use libcudf's ``ChunkedParquetReader`` or ``ChunkedParquetWriter``
        to read/write the parquet dataset in chunks. This is useful when reading/writing
        very large parquet files.
    n_output_chunks
        Split the dataframe in ``n_output_chunks`` when using libcudf's ``ChunkedParquetWriter``.
    chunk_read_limit
        Limit on total number of bytes to be returned per read, or 0 if
        there is no limit.
    pass_read_limit
        Limit on the amount of memory used for reading and decompressing data
        or 0 if there is no limit.
    max_footer_samples
        Maximum number of file footers to sample for metadata. This
        option is currently used by the streaming executor to gather
        datasource statistics before generating a physical plan. Set to
        0 to avoid metadata sampling. Default is 3.
    max_row_group_samples
        Maximum number of row-groups to sample for unique-value statistics.
        This option may be used by the streaming executor to optimize
        the physical plan. Default is 1.

        Set to 0 to avoid row-group sampling. Note that row-group sampling
        will also be skipped if ``max_footer_samples`` is 0.
    """

    _env_prefix = "CUDF_POLARS__PARQUET_OPTIONS"

    chunked: bool = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__CHUNKED", _bool_converter, default=True
        )
    )
    n_output_chunks: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__N_OUTPUT_CHUNKS", int, default=1
        )
    )
    chunk_read_limit: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__CHUNK_READ_LIMIT", int, default=0
        )
    )
    pass_read_limit: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__PASS_READ_LIMIT", int, default=0
        )
    )
    max_footer_samples: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__MAX_FOOTER_SAMPLES", int, default=3
        )
    )
    max_row_group_samples: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__MAX_ROW_GROUP_SAMPLES", int, default=1
        )
    )

    def __post_init__(self) -> None:  # noqa: D105
        if not isinstance(self.chunked, bool):
            raise TypeError("chunked must be a bool")
        if not isinstance(self.n_output_chunks, int):
            raise TypeError("n_output_chunks must be an int")
        if not isinstance(self.chunk_read_limit, int):
            raise TypeError("chunk_read_limit must be an int")
        if not isinstance(self.pass_read_limit, int):
            raise TypeError("pass_read_limit must be an int")
        if not isinstance(self.max_footer_samples, int):
            raise TypeError("max_footer_samples must be an int")
        if not isinstance(self.max_row_group_samples, int):
            raise TypeError("max_row_group_samples must be an int")


def default_blocksize(scheduler: str) -> int:
    """Return the default blocksize."""
    device_size = get_total_device_memory()
    if device_size is None:  # pragma: no cover
        # System doesn't have proper "GPU memory".
        # Fall back to a conservative 1GB default.
        return 1_000_000_000

    if (
        scheduler == "distributed"
        or _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 0
    ):
        # Distributed execution requires a conservative
        # blocksize for now. We are also more conservative
        # when UVM is disabled.
        blocksize = int(device_size * 0.025)
    else:
        # Single-GPU execution can lean on UVM to
        # support a much larger blocksize.
        blocksize = int(device_size * 0.0625)

    # Use lower and upper bounds of 1GB and 10GB
    return min(max(blocksize, 1_000_000_000), 10_000_000_000)
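As a worked example of the arithmetic above, for a hypothetical 80 GB device: the distributed (or UVM-disabled) path gives 0.025 × 80 GB = 2 GB and the single-GPU path gives 0.0625 × 80 GB = 5 GB, both already inside the 1 GB to 10 GB clamp, while a 12 GB device on the distributed path would be clamped up from 0.3 GB to the 1 GB floor:

```python
def clamp(b: int) -> int:
    # Same 1 GB / 10 GB bounds as default_blocksize above.
    return min(max(b, 1_000_000_000), 10_000_000_000)

device = 80_000_000_000  # assumed 80 GB of device memory
assert clamp(int(device * 0.025)) == 2_000_000_000   # distributed / UVM disabled
assert clamp(int(device * 0.0625)) == 5_000_000_000  # synchronous with UVM
assert clamp(int(12_000_000_000 * 0.025)) == 1_000_000_000  # small device hits the floor
```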


@dataclasses.dataclass(frozen=True)
class StatsPlanningOptions:
    """
    Configuration for statistics-based query planning.

    These options can be configured via environment variables
    with the prefix ``CUDF_POLARS__EXECUTOR__STATS_PLANNING__``.

    Parameters
    ----------
    use_io_partitioning
        Whether to use estimated file-size statistics to calculate
        the ideal input-partition count for IO operations.
        This option currently applies to Parquet data only.
        Default is True.
    use_reduction_planning
        Whether to use estimated column statistics to calculate
        the output-partition count for reduction operations
        like `Distinct`, `GroupBy`, and `Select(unique)`.
        Default is False.
    use_join_heuristics
        Whether to use join heuristics to estimate row-count
        and unique-count statistics. Default is True.
        These statistics may only be collected when they are
        actually needed for query planning and when row-count
        statistics are available for the underlying datasource
        (e.g. Parquet and in-memory LazyFrame data).
    use_sampling
        Whether to sample real data to estimate unique-value
        statistics. Default is True.
        These statistics may only be collected when they are
        actually needed for query planning, and when the
        underlying datasource supports sampling (e.g. Parquet
        and in-memory LazyFrame data).
    default_selectivity
        The default selectivity of a predicate.
        Default is 0.8.
    """

    _env_prefix = "CUDF_POLARS__EXECUTOR__STATS_PLANNING"

    use_io_partitioning: bool = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__USE_IO_PARTITIONING", _bool_converter, default=True
        )
    )
    use_reduction_planning: bool = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__USE_REDUCTION_PLANNING", _bool_converter, default=False
        )
    )
    use_join_heuristics: bool = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__USE_JOIN_HEURISTICS", _bool_converter, default=True
        )
    )
    use_sampling: bool = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__USE_SAMPLING", _bool_converter, default=True
        )
    )
    default_selectivity: float = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__DEFAULT_SELECTIVITY", float, default=0.8
        )
    )

    def __post_init__(self) -> None:  # noqa: D105
        if not isinstance(self.use_io_partitioning, bool):
            raise TypeError("use_io_partitioning must be a bool")
        if not isinstance(self.use_reduction_planning, bool):
            raise TypeError("use_reduction_planning must be a bool")
        if not isinstance(self.use_join_heuristics, bool):
            raise TypeError("use_join_heuristics must be a bool")
        if not isinstance(self.use_sampling, bool):
            raise TypeError("use_sampling must be a bool")
        if not isinstance(self.default_selectivity, float):
            raise TypeError("default_selectivity must be a float")


@dataclasses.dataclass(frozen=True, eq=True)
class StreamingExecutor:
    """
    Configuration for the cudf-polars streaming executor.

    These options can be configured via environment variables
    with the prefix ``CUDF_POLARS__EXECUTOR__``.

    Parameters
    ----------
    scheduler
        The scheduler to use for the streaming executor. ``Scheduler.SYNCHRONOUS``
        by default.

        Note ``scheduler="distributed"`` requires a Dask cluster to be running.
    fallback_mode
        How to handle errors when the GPU engine fails to execute a query.
        ``StreamingFallbackMode.WARN`` by default.

        This can be set using the ``CUDF_POLARS__EXECUTOR__FALLBACK_MODE``
        environment variable.
    max_rows_per_partition
        The maximum number of rows to process per partition. 1_000_000 by default.
        When the number of rows exceeds this value, the query will be split into
        multiple partitions and executed in parallel.
    unique_fraction
        A dictionary mapping column names to floats between 0 and 1 (inclusive
        on the right).

        Each factor estimates the fractional number of unique values in the
        column. By default, ``1.0`` is used for any column not included in
        ``unique_fraction``.
    target_partition_size
        Target partition size, in bytes, for IO tasks. This configuration currently
        controls how large parquet files are split into multiple partitions.
        Files larger than ``target_partition_size`` bytes are split into multiple
        partitions.

        This can be set via

        - keyword argument to ``polars.GPUEngine``
        - the ``CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE`` environment variable

        By default, cudf-polars uses a target partition size that's a fraction
        of the device memory, where the fraction depends on the scheduler:

        - distributed: 1/40th of the device memory
        - synchronous: 1/16th of the device memory

        The optional pynvml dependency is used to query the device memory size. If
        pynvml is not available, a warning is emitted and the device size is assumed
        to be 12 GiB.

    groupby_n_ary
        The factor by which the number of partitions is decreased when performing
        a groupby on a partitioned column. For example, if a column has 64 partitions,
        it will first be reduced to ``ceil(64 / 32) = 2`` partitions.

        This is useful when the absolute number of partitions is large.
    broadcast_join_limit
        The maximum number of partitions to allow for the smaller table in
        a broadcast join.
    shuffle_method
        The method to use for shuffling data between workers. Defaults to
        'rapidsmpf' for distributed scheduler if available (otherwise 'tasks'),
        and 'tasks' for synchronous scheduler.
    rapidsmpf_spill
        Whether to wrap task arguments and output in objects that are
        spillable by 'rapidsmpf'.
    sink_to_directory
        Whether multi-partition sink operations should write to a directory
        rather than a single file. By default, this will be set to True for
        the 'distributed' scheduler and False otherwise. The 'distributed'
        scheduler does not currently support ``sink_to_directory=False``.
    stats_planning
        Options controlling statistics-based query planning. See
        :class:`~cudf_polars.utils.config.StatsPlanningOptions` for more.

    Notes
    -----
    The streaming executor does not currently support profiling a query via
    the ``.profile()`` method. We recommend using nsys to profile queries
    with the 'synchronous' scheduler and Dask's built-in profiling tools
    with the 'distributed' scheduler.
    """

    _env_prefix = "CUDF_POLARS__EXECUTOR"

    name: Literal["streaming"] = dataclasses.field(default="streaming", init=False)
    scheduler: Scheduler = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__SCHEDULER",
            Scheduler.__call__,
            default=Scheduler.SYNCHRONOUS,
        )
    )
    fallback_mode: StreamingFallbackMode = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__FALLBACK_MODE",
            StreamingFallbackMode.__call__,
            default=StreamingFallbackMode.WARN,
        )
    )
    max_rows_per_partition: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__MAX_ROWS_PER_PARTITION", int, default=1_000_000
        )
    )
    unique_fraction: dict[str, float] = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__UNIQUE_FRACTION", json.loads, default={}
        )
    )
    target_partition_size: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__TARGET_PARTITION_SIZE", int, default=0
        )
    )
    groupby_n_ary: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__GROUPBY_N_ARY", int, default=32
        )
    )
    broadcast_join_limit: int = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__BROADCAST_JOIN_LIMIT", int, default=0
        )
    )
    shuffle_method: ShuffleMethod = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__SHUFFLE_METHOD",
            ShuffleMethod.__call__,
            default=ShuffleMethod.TASKS,
        )
    )
    rapidsmpf_spill: bool = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__RAPIDSMPF_SPILL", _bool_converter, default=False
        )
    )
    sink_to_directory: bool | None = dataclasses.field(
        default_factory=_make_default_factory(
            f"{_env_prefix}__SINK_TO_DIRECTORY", _bool_converter, default=None
        )
    )
    stats_planning: StatsPlanningOptions = dataclasses.field(
        default_factory=StatsPlanningOptions
    )

    def __post_init__(self) -> None:  # noqa: D105
        # Handle shuffle_method defaults for streaming executor
        if self.shuffle_method is None:
            if self.scheduler == "distributed" and rapidsmpf_distributed_available():
                # For distributed scheduler, prefer rapidsmpf if available
                object.__setattr__(self, "shuffle_method", "rapidsmpf")
            else:
                # Otherwise, use task-based shuffle for now.
                # TODO: Evaluate single-process shuffle by default.
                object.__setattr__(self, "shuffle_method", "tasks")
        elif self.shuffle_method == "rapidsmpf-single":
            # The user should NOT specify "rapidsmpf-single" directly.
            raise ValueError("rapidsmpf-single is not a supported shuffle method.")
        elif self.shuffle_method == "rapidsmpf":
            # Check that we have rapidsmpf installed
            if (
                self.scheduler == "distributed"
                and not rapidsmpf_distributed_available()
            ):
                raise ValueError(
                    "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed."
                )
            elif self.scheduler == "synchronous" and not rapidsmpf_single_available():
                raise ValueError(
                    "rapidsmpf shuffle method requested, but rapidsmpf is not installed."
                )
            # Select "rapidsmpf-single" for the synchronous
            if self.scheduler == "synchronous":
                object.__setattr__(self, "shuffle_method", "rapidsmpf-single")

        # frozen dataclass, so use object.__setattr__
        object.__setattr__(
            self, "fallback_mode", StreamingFallbackMode(self.fallback_mode)
        )
        if self.target_partition_size == 0:
            object.__setattr__(
                self, "target_partition_size", default_blocksize(self.scheduler)
            )
        if self.broadcast_join_limit == 0:
            object.__setattr__(
                self,
                "broadcast_join_limit",
                # Usually better to avoid shuffling for single gpu
                2 if self.scheduler == "distributed" else 32,
            )
        object.__setattr__(self, "scheduler", Scheduler(self.scheduler))
        object.__setattr__(self, "shuffle_method", ShuffleMethod(self.shuffle_method))

        # Make sure stats_planning is a dataclass
        if isinstance(self.stats_planning, dict):
            object.__setattr__(
                self,
                "stats_planning",
                StatsPlanningOptions(**self.stats_planning),
            )

        if self.scheduler == "distributed":
            if self.sink_to_directory is False:
                raise ValueError(
                    "The distributed scheduler requires sink_to_directory=True"
                )
            object.__setattr__(self, "sink_to_directory", True)
        elif self.sink_to_directory is None:
            object.__setattr__(self, "sink_to_directory", False)

        # Type / value check everything else
        if not isinstance(self.max_rows_per_partition, int):
            raise TypeError("max_rows_per_partition must be an int")
        if not isinstance(self.unique_fraction, dict):
            raise TypeError("unique_fraction must be a dict of column name to float")
        if not isinstance(self.target_partition_size, int):
            raise TypeError("target_partition_size must be an int")
        if not isinstance(self.groupby_n_ary, int):
            raise TypeError("groupby_n_ary must be an int")
        if not isinstance(self.broadcast_join_limit, int):
            raise TypeError("broadcast_join_limit must be an int")
        if not isinstance(self.rapidsmpf_spill, bool):
            raise TypeError("rapidsmpf_spill must be bool")
        if not isinstance(self.sink_to_directory, bool):
            raise TypeError("sink_to_directory must be bool")

        # RapidsMPF spill is only supported for the distributed scheduler for now.
        # This is because the spilling API is still within the RMPF-Dask integration.
        # (See https://github.com/rapidsai/rapidsmpf/issues/439)
        if self.scheduler == "synchronous" and self.rapidsmpf_spill:  # pragma: no cover
            raise ValueError(
                "rapidsmpf_spill is not supported for the synchronous scheduler."
            )

    def __hash__(self) -> int:  # noqa: D105
        # cardinality factory, a dict, isn't natively hashable. We'll dump it
        # to json and hash that.
        d = dataclasses.asdict(self)
        d["unique_fraction"] = json.dumps(d["unique_fraction"])
        d["stats_planning"] = json.dumps(d["stats_planning"])
        return hash(tuple(sorted(d.items())))
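Putting the streaming options together, the sketch below configures the synchronous scheduler with explicit fallback and partitioning settings and then runs a query with it; the column name, file path, and the surrounding `scan_parquet`/`collect(engine=...)` calls are ordinary polars usage assumed here for illustration only:

```python
import polars as pl

engine = pl.GPUEngine(
    executor="streaming",
    executor_options={
        "scheduler": "synchronous",
        "fallback_mode": "raise",  # error out instead of silently using one partition
        "max_rows_per_partition": 2_000_000,
        "unique_fraction": {"order_key": 0.25},  # hypothetical column name
        "target_partition_size": 2_000_000_000,
    },
)

result = pl.scan_parquet("data/*.parquet").collect(engine=engine)  # assumed dataset path
```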


@dataclasses.dataclass(frozen=True, eq=True)
class InMemoryExecutor:
    """
    Configuration for the cudf-polars in-memory executor.

    Parameters
    ----------
    scheduler:
        The scheduler to use for the in-memory executor. Currently
        only ``Scheduler.SYNCHRONOUS`` is supported for the in-memory executor.
    """

    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)


@dataclasses.dataclass(frozen=True, eq=True)
class ConfigOptions:
    """
    Configuration for the polars GPUEngine.

    Parameters
    ----------
    raise_on_fail
        Whether to raise an exception when the GPU engine cannot execute a
        query. ``False`` by default.
    parquet_options
        Options controlling parquet file reading and writing. See
        :class:`~cudf_polars.utils.config.ParquetOptions` for more.
    executor
        The executor to use for the GPU engine. See :class:`~cudf_polars.utils.config.StreamingExecutor`
        and :class:`~cudf_polars.utils.config.InMemoryExecutor` for more.
    device
        The GPU used to run the query. If not provided, the
        query uses the current CUDA device.
    """

    raise_on_fail: bool = False
    parquet_options: ParquetOptions = dataclasses.field(default_factory=ParquetOptions)
    executor: StreamingExecutor | InMemoryExecutor = dataclasses.field(
        default_factory=StreamingExecutor
    )
    device: int | None = None

    @classmethod
    def from_polars_engine(
        cls, engine: polars.lazyframe.engine_config.GPUEngine
    ) -> Self:
        """Create a :class:`ConfigOptions` from a :class:`~polars.lazyframe.engine_config.GPUEngine`."""
        # these are the valid top-level keys in the engine.config that
        # the user passes as **kwargs to GPUEngine.
        valid_options = {
            "executor",
            "executor_options",
            "parquet_options",
            "raise_on_fail",
        }

        extra_options = set(engine.config.keys()) - valid_options
        if extra_options:
            raise TypeError(f"Unsupported executor_options: {extra_options}")

        env_prefix = "CUDF_POLARS"
        user_executor = engine.config.get("executor")
        if user_executor is None:
            user_executor = os.environ.get(f"{env_prefix}__EXECUTOR", "streaming")
        user_executor_options = engine.config.get("executor_options", {})
        user_parquet_options = engine.config.get("parquet_options", {})
        # This is set in polars, and so can't be overridden by the environment
        user_raise_on_fail = engine.config.get("raise_on_fail", False)

        # Backward compatibility for "cardinality_factor"
        # TODO: Remove this in 25.10
        if "cardinality_factor" in user_executor_options:
            warnings.warn(
                "The 'cardinality_factor' configuration is deprecated. "
                "Please use 'unique_fraction' instead.",
                FutureWarning,
                stacklevel=2,
            )
            cardinality_factor = user_executor_options.pop("cardinality_factor")
            if "unique_fraction" not in user_executor_options:
                user_executor_options["unique_fraction"] = cardinality_factor

        # These are user-provided options, so we need to actually validate
        # them.

        if user_executor not in {"in-memory", "streaming"}:
            raise ValueError(f"Unknown executor '{user_executor}'")

        if not isinstance(user_raise_on_fail, bool):
            raise TypeError("GPUEngine option 'raise_on_fail' must be a boolean.")

        executor: InMemoryExecutor | StreamingExecutor

        match user_executor:
            case "in-memory":
                executor = InMemoryExecutor(**user_executor_options)
            case "streaming":
                user_executor_options = user_executor_options.copy()
                # Handle the interaction between the default shuffle method, the
                # scheduler, and whether rapidsmpf is available.
                env_shuffle_method = os.environ.get(
                    "CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", None
                )
                if env_shuffle_method is not None:
                    shuffle_method_default = ShuffleMethod(env_shuffle_method)
                else:
                    shuffle_method_default = None

                user_executor_options.setdefault(
                    "shuffle_method", shuffle_method_default
                )
                executor = StreamingExecutor(**user_executor_options)
            case _:  # pragma: no cover; Unreachable
                raise ValueError(f"Unsupported executor: {user_executor}")

        return cls(
            raise_on_fail=user_raise_on_fail,
            parquet_options=ParquetOptions(**user_parquet_options),
            executor=executor,
            device=engine.device,
        )
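For completeness, `ConfigOptions.from_polars_engine` above is the entry point that turns the user-facing `GPUEngine` keyword arguments into the frozen dataclasses defined in this module. A short, illustrative sketch of inspecting the result:

```python
import polars as pl

from cudf_polars.utils.config import ConfigOptions, Scheduler

config = ConfigOptions.from_polars_engine(
    pl.GPUEngine(
        executor="streaming",
        executor_options={"fallback_mode": "silent"},
        raise_on_fail=True,
    )
)

assert config.raise_on_fail is True
assert config.executor.name == "streaming"
assert config.executor.scheduler is Scheduler.SYNCHRONOUS  # the default scheduler
```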