cudf-polars-cu13 25.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/utils/config.py
@@ -0,0 +1,741 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+
+ """
+ Configuration utilities for the cudf-polars engine.
+
+ Most users will not construct these objects directly. Instead, you'll pass
+ keyword arguments to :class:`~polars.lazyframe.engine_config.GPUEngine`. The
+ majority of the options are passed as `**kwargs` and collected into the
+ configuration described below:
+
+ .. code-block:: python
+
+     >>> import polars as pl
+     >>> engine = pl.GPUEngine(
+     ...     executor="streaming",
+     ...     executor_options={"fallback_mode": "raise"}
+     ... )
+
+ """
+
+ from __future__ import annotations
+
+ import dataclasses
+ import enum
+ import functools
+ import importlib.util
+ import json
+ import os
+ import warnings
+ from typing import TYPE_CHECKING, Literal, TypeVar
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+     from typing_extensions import Self
+
+     import polars.lazyframe.engine_config
+
+
+ __all__ = [
+     "ConfigOptions",
+     "InMemoryExecutor",
+     "ParquetOptions",
+     "Scheduler",
+     "ShuffleMethod",
+     "StatsPlanningOptions",
+     "StreamingExecutor",
+     "StreamingFallbackMode",
+ ]
+
+
+ def _env_get_int(name: str, default: int) -> int:
+     try:
+         return int(os.getenv(name, default))
+     except (ValueError, TypeError):  # pragma: no cover
+         return default  # pragma: no cover
+
+
+ def get_total_device_memory() -> int | None:
+     """Return the total memory of the current device."""
+     import pynvml
+
+     try:
+         pynvml.nvmlInit()
+         index = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+         if index and not index.isnumeric():  # pragma: no cover
+             # This means device_index is UUID.
+             # This works for both MIG and non-MIG device UUIDs.
+             handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(index))
+             if pynvml.nvmlDeviceIsMigDeviceHandle(handle):
+                 # Additionally get parent device handle
+                 # if the device itself is a MIG instance
+                 handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(handle)
+         else:
+             handle = pynvml.nvmlDeviceGetHandleByIndex(int(index))
+
+         return pynvml.nvmlDeviceGetMemoryInfo(handle).total
+
+     except pynvml.NVMLError_NotSupported:  # pragma: no cover
+         # System doesn't have proper "GPU memory".
+         return None
+
+
+ @functools.cache
+ def rapidsmpf_single_available() -> bool:  # pragma: no cover
+     """Query whether rapidsmpf is available as a single-process shuffle method."""
+     try:
+         return importlib.util.find_spec("rapidsmpf.integrations.single") is not None
+     except (ImportError, ValueError):
+         return False
+
+
+ @functools.cache
+ def rapidsmpf_distributed_available() -> bool:  # pragma: no cover
+     """Query whether rapidsmpf is available as a distributed shuffle method."""
+     try:
+         return importlib.util.find_spec("rapidsmpf.integrations.dask") is not None
+     except (ImportError, ValueError):
+         return False
+
+
+ # TODO: Use enum.StrEnum when we drop Python 3.10
+
+
+ class StreamingFallbackMode(str, enum.Enum):
+     """
+     How the streaming executor handles operations that don't support multiple partitions.
+
+     Upon encountering an unsupported operation, the streaming executor will fall
+     back to using a single partition, which might use a large amount of memory.
+
+     * ``StreamingFallbackMode.WARN`` : Emit a warning and fall back to a single partition.
+     * ``StreamingFallbackMode.SILENT``: Silently fall back to a single partition.
+     * ``StreamingFallbackMode.RAISE`` : Raise an exception.
+     """
+
+     WARN = "warn"
+     RAISE = "raise"
+     SILENT = "silent"
+
+
+ class Scheduler(str, enum.Enum):
+     """
+     The scheduler to use for the streaming executor.
+
+     * ``Scheduler.SYNCHRONOUS`` : A zero-dependency, synchronous,
+       single-threaded scheduler.
+     * ``Scheduler.DISTRIBUTED`` : A Dask-based distributed scheduler.
+       Using this scheduler requires an active Dask cluster.
+     """
+
+     SYNCHRONOUS = "synchronous"
+     DISTRIBUTED = "distributed"
+
+
+ class ShuffleMethod(str, enum.Enum):
+     """
+     The method to use for shuffling data between workers with the streaming executor.
+
+     * ``ShuffleMethod.TASKS`` : Use the task-based shuffler.
+     * ``ShuffleMethod.RAPIDSMPF`` : Use the rapidsmpf shuffler.
+     * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler.
+
+     With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None``
+     will attempt to use ``ShuffleMethod.RAPIDSMPF`` for the distributed scheduler,
+     but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed.
+
+     The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly.
+     A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process
+     shuffler automatically when the 'synchronous' scheduler is active.
+     """
+
+     TASKS = "tasks"
+     RAPIDSMPF = "rapidsmpf"
+     _RAPIDSMPF_SINGLE = "rapidsmpf-single"
+
+
+ T = TypeVar("T")
+
+
+ def _make_default_factory(
+     key: str, converter: Callable[[str], T], *, default: T
+ ) -> Callable[[], T]:
+     def default_factory() -> T:
+         v = os.environ.get(key)
+         if v is None:
+             return default
+         return converter(v)
+
+     return default_factory
+
+
+ def _bool_converter(v: str) -> bool:
+     lowered = v.lower()
+     if lowered in {"1", "true", "yes", "y"}:
+         return True
+     elif lowered in {"0", "false", "no", "n"}:
+         return False
+     else:
+         raise ValueError(f"Invalid boolean value: '{v}'")
+
+
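The two helpers above are the backbone of this module's environment-variable support: every dataclass field below uses a ``default_factory`` built by ``_make_default_factory``, which reads a ``CUDF_POLARS__``-prefixed variable at construction time, converts it (``_bool_converter``, ``int``, ``json.loads``, an enum constructor, ...), and otherwise falls back to the hard-coded default. A minimal standalone sketch of the same pattern, using illustrative names (``make_default_factory``, ``DemoOptions``, ``DEMO__N_WORKERS``) that are not part of the package:

    import dataclasses
    import os
    from collections.abc import Callable
    from typing import TypeVar

    T = TypeVar("T")

    def make_default_factory(
        key: str, converter: Callable[[str], T], *, default: T
    ) -> Callable[[], T]:
        # Read `key` from the environment when the dataclass is instantiated,
        # falling back to `default` if the variable is unset.
        def default_factory() -> T:
            v = os.environ.get(key)
            return default if v is None else converter(v)

        return default_factory

    @dataclasses.dataclass(frozen=True)
    class DemoOptions:
        n_workers: int = dataclasses.field(
            default_factory=make_default_factory("DEMO__N_WORKERS", int, default=4)
        )

    assert DemoOptions().n_workers == 4
    os.environ["DEMO__N_WORKERS"] = "8"
    assert DemoOptions().n_workers == 8  # env var wins because the default is read lazily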
+ @dataclasses.dataclass(frozen=True)
+ class ParquetOptions:
+     """
+     Configuration for the cudf-polars Parquet engine.
+
+     These options can be configured via environment variables
+     with the prefix ``CUDF_POLARS__PARQUET_OPTIONS__``.
+
+     Parameters
+     ----------
+     chunked
+         Whether to use libcudf's ``ChunkedParquetReader`` or ``ChunkedParquetWriter``
+         to read/write the parquet dataset in chunks. This is useful when reading/writing
+         very large parquet files.
+     n_output_chunks
+         Split the dataframe into ``n_output_chunks`` when using libcudf's ``ChunkedParquetWriter``.
+     chunk_read_limit
+         Limit on total number of bytes to be returned per read, or 0 if
+         there is no limit.
+     pass_read_limit
+         Limit on the amount of memory used for reading and decompressing data,
+         or 0 if there is no limit.
+     max_footer_samples
+         Maximum number of file footers to sample for metadata. This
+         option is currently used by the streaming executor to gather
+         datasource statistics before generating a physical plan. Set to
+         0 to avoid metadata sampling. Default is 3.
+     max_row_group_samples
+         Maximum number of row-groups to sample for unique-value statistics.
+         This option may be used by the streaming executor to optimize
+         the physical plan. Default is 1.
+
+         Set to 0 to avoid row-group sampling. Note that row-group sampling
+         will also be skipped if ``max_footer_samples`` is 0.
+     """
+
+     _env_prefix = "CUDF_POLARS__PARQUET_OPTIONS"
+
+     chunked: bool = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__CHUNKED", _bool_converter, default=True
+         )
+     )
+     n_output_chunks: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__N_OUTPUT_CHUNKS", int, default=1
+         )
+     )
+     chunk_read_limit: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__CHUNK_READ_LIMIT", int, default=0
+         )
+     )
+     pass_read_limit: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__PASS_READ_LIMIT", int, default=0
+         )
+     )
+     max_footer_samples: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__MAX_FOOTER_SAMPLES", int, default=3
+         )
+     )
+     max_row_group_samples: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__MAX_ROW_GROUP_SAMPLES", int, default=1
+         )
+     )
+
+     def __post_init__(self) -> None:  # noqa: D105
+         if not isinstance(self.chunked, bool):
+             raise TypeError("chunked must be a bool")
+         if not isinstance(self.n_output_chunks, int):
+             raise TypeError("n_output_chunks must be an int")
+         if not isinstance(self.chunk_read_limit, int):
+             raise TypeError("chunk_read_limit must be an int")
+         if not isinstance(self.pass_read_limit, int):
+             raise TypeError("pass_read_limit must be an int")
+         if not isinstance(self.max_footer_samples, int):
+             raise TypeError("max_footer_samples must be an int")
+         if not isinstance(self.max_row_group_samples, int):
+             raise TypeError("max_row_group_samples must be an int")
+
+
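For reference, these options reach ``ParquetOptions`` through the ``parquet_options`` key of ``GPUEngine`` (see ``ConfigOptions.from_polars_engine`` at the end of this file), or through the environment prefix documented above. A hedged sketch of both paths; the dataset path is hypothetical:

    import os

    import polars as pl

    # Keyword path: forwarded as ParquetOptions(**parquet_options).
    engine = pl.GPUEngine(
        executor="streaming",
        parquet_options={"chunked": False, "max_footer_samples": 0},
    )

    # Environment path: read when the options object is constructed.
    os.environ["CUDF_POLARS__PARQUET_OPTIONS__PASS_READ_LIMIT"] = "16000000000"

    # q = pl.scan_parquet("data/*.parquet")   # hypothetical dataset
    # result = q.collect(engine=engine)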
+ def default_blocksize(scheduler: str) -> int:
+     """Return the default blocksize."""
+     device_size = get_total_device_memory()
+     if device_size is None:  # pragma: no cover
+         # System doesn't have proper "GPU memory".
+         # Fall back to a conservative 1GB default.
+         return 1_000_000_000
+
+     if (
+         scheduler == "distributed"
+         or _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 0
+     ):
+         # Distributed execution requires a conservative
+         # blocksize for now. We are also more conservative
+         # when UVM is disabled.
+         blocksize = int(device_size * 0.025)
+     else:
+         # Single-GPU execution can lean on UVM to
+         # support a much larger blocksize.
+         blocksize = int(device_size * 0.0625)
+
+     # Use lower and upper bounds of 1GB and 10GB
+     return min(max(blocksize, 1_000_000_000), 10_000_000_000)
+
+
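To make the 0.025 and 0.0625 factors concrete (they correspond to the 1/40 and 1/16 fractions cited in the ``StreamingExecutor`` docstring below), here is a worked example assuming an 80 GB device; the device size is an illustrative assumption, not a queried value:

    # Assumed total device memory: 80 GB.
    device_size = 80_000_000_000

    # Distributed scheduler, or UVM disabled: 2.5% of device memory.
    assert int(device_size * 0.025) == 2_000_000_000  # 2 GB

    # Synchronous scheduler with UVM enabled: 6.25% of device memory.
    assert int(device_size * 0.0625) == 5_000_000_000  # 5 GB

    # Either result is then clamped to the [1 GB, 10 GB] range.
    def clamp(blocksize: int) -> int:
        return min(max(blocksize, 1_000_000_000), 10_000_000_000)

    assert clamp(2_000_000_000) == 2_000_000_000
    assert clamp(int(200_000_000_000 * 0.0625)) == 10_000_000_000  # capped for very large devices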
+ @dataclasses.dataclass(frozen=True)
+ class StatsPlanningOptions:
+     """
+     Configuration for statistics-based query planning.
+
+     These options can be configured via environment variables
+     with the prefix ``CUDF_POLARS__EXECUTOR__STATS_PLANNING__``.
+
+     Parameters
+     ----------
+     use_io_partitioning
+         Whether to use estimated file-size statistics to calculate
+         the ideal input-partition count for IO operations.
+         This option currently applies to Parquet data only.
+         Default is True.
+     use_reduction_planning
+         Whether to use estimated column statistics to calculate
+         the output-partition count for reduction operations
+         like `Distinct`, `GroupBy`, and `Select(unique)`.
+         Default is False.
+     use_join_heuristics
+         Whether to use join heuristics to estimate row-count
+         and unique-count statistics. Default is True.
+         These statistics may only be collected when they are
+         actually needed for query planning and when row-count
+         statistics are available for the underlying datasource
+         (e.g. Parquet and in-memory LazyFrame data).
+     use_sampling
+         Whether to sample real data to estimate unique-value
+         statistics. Default is True.
+         These statistics may only be collected when they are
+         actually needed for query planning, and when the
+         underlying datasource supports sampling (e.g. Parquet
+         and in-memory LazyFrame data).
+     default_selectivity
+         The default selectivity of a predicate.
+         Default is 0.8.
+     """
+
+     _env_prefix = "CUDF_POLARS__EXECUTOR__STATS_PLANNING"
+
+     use_io_partitioning: bool = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__USE_IO_PARTITIONING", _bool_converter, default=True
+         )
+     )
+     use_reduction_planning: bool = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__USE_REDUCTION_PLANNING", _bool_converter, default=False
+         )
+     )
+     use_join_heuristics: bool = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__USE_JOIN_HEURISTICS", _bool_converter, default=True
+         )
+     )
+     use_sampling: bool = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__USE_SAMPLING", _bool_converter, default=True
+         )
+     )
+     default_selectivity: float = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__DEFAULT_SELECTIVITY", float, default=0.8
+         )
+     )
+
+     def __post_init__(self) -> None:  # noqa: D105
+         if not isinstance(self.use_io_partitioning, bool):
+             raise TypeError("use_io_partitioning must be a bool")
+         if not isinstance(self.use_reduction_planning, bool):
+             raise TypeError("use_reduction_planning must be a bool")
+         if not isinstance(self.use_join_heuristics, bool):
+             raise TypeError("use_join_heuristics must be a bool")
+         if not isinstance(self.use_sampling, bool):
+             raise TypeError("use_sampling must be a bool")
+         if not isinstance(self.default_selectivity, float):
+             raise TypeError("default_selectivity must be a float")
+
+
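These knobs are nested one level down from the executor options: ``StreamingExecutor.__post_init__`` (below) accepts ``stats_planning`` as a plain dict and converts it into a ``StatsPlanningOptions``. A hedged configuration sketch:

    import os

    import polars as pl

    # Nested dict under executor_options; converted to StatsPlanningOptions internally.
    engine = pl.GPUEngine(
        executor="streaming",
        executor_options={
            "stats_planning": {
                "use_reduction_planning": True,
                "default_selectivity": 0.5,
            },
        },
    )

    # Equivalent per-option override via the documented environment prefix.
    os.environ["CUDF_POLARS__EXECUTOR__STATS_PLANNING__USE_SAMPLING"] = "0"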
+ @dataclasses.dataclass(frozen=True, eq=True)
+ class StreamingExecutor:
+     """
+     Configuration for the cudf-polars streaming executor.
+
+     These options can be configured via environment variables
+     with the prefix ``CUDF_POLARS__EXECUTOR__``.
+
+     Parameters
+     ----------
+     scheduler
+         The scheduler to use for the streaming executor. ``Scheduler.SYNCHRONOUS``
+         by default.
+
+         Note ``scheduler="distributed"`` requires a Dask cluster to be running.
+     fallback_mode
+         How to handle errors when the GPU engine fails to execute a query.
+         ``StreamingFallbackMode.WARN`` by default.
+
+         This can be set using the ``CUDF_POLARS__EXECUTOR__FALLBACK_MODE``
+         environment variable.
+     max_rows_per_partition
+         The maximum number of rows to process per partition. 1_000_000 by default.
+         When the number of rows exceeds this value, the query will be split into
+         multiple partitions and executed in parallel.
+     unique_fraction
+         A dictionary mapping column names to floats between 0 and 1 (inclusive
+         on the right).
+
+         Each factor estimates the fractional number of unique values in the
+         column. By default, ``1.0`` is used for any column not included in
+         ``unique_fraction``.
+     target_partition_size
+         Target partition size, in bytes, for IO tasks. This configuration currently
+         controls how large parquet files are split into multiple partitions.
+         Files larger than ``target_partition_size`` bytes are split into multiple
+         partitions.
+
+         This can be set via
+
+         - keyword argument to ``polars.GPUEngine``
+         - the ``CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE`` environment variable
+
+         By default, cudf-polars uses a target partition size that's a fraction
+         of the device memory, where the fraction depends on the scheduler:
+
+         - distributed: 1/40th of the device memory
+         - synchronous: 1/16th of the device memory
+
+         The optional pynvml dependency is used to query the device memory size. If
+         pynvml is not available, a warning is emitted and the device size is assumed
+         to be 12 GiB.
+
+     groupby_n_ary
+         The factor by which the number of partitions is decreased when performing
+         a groupby on a partitioned column. For example, if a column has 64 partitions,
+         it will first be reduced to ``ceil(64 / 32) = 2`` partitions.
+
+         This is useful when the absolute number of partitions is large.
+     broadcast_join_limit
+         The maximum number of partitions to allow for the smaller table in
+         a broadcast join.
+     shuffle_method
+         The method to use for shuffling data between workers. Defaults to
+         'rapidsmpf' for distributed scheduler if available (otherwise 'tasks'),
+         and 'tasks' for synchronous scheduler.
+     rapidsmpf_spill
+         Whether to wrap task arguments and output in objects that are
+         spillable by 'rapidsmpf'.
+     sink_to_directory
+         Whether multi-partition sink operations should write to a directory
+         rather than a single file. By default, this will be set to True for
+         the 'distributed' scheduler and False otherwise. The 'distributed'
+         scheduler does not currently support ``sink_to_directory=False``.
+     stats_planning
+         Options controlling statistics-based query planning. See
+         :class:`~cudf_polars.utils.config.StatsPlanningOptions` for more.
+
+     Notes
+     -----
+     The streaming executor does not currently support profiling a query via
+     the ``.profile()`` method. We recommend using nsys to profile queries
+     with the 'synchronous' scheduler and Dask's built-in profiling tools
+     with the 'distributed' scheduler.
+     """
+
+     _env_prefix = "CUDF_POLARS__EXECUTOR"
+
+     name: Literal["streaming"] = dataclasses.field(default="streaming", init=False)
+     scheduler: Scheduler = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__SCHEDULER",
+             Scheduler.__call__,
+             default=Scheduler.SYNCHRONOUS,
+         )
+     )
+     fallback_mode: StreamingFallbackMode = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__FALLBACK_MODE",
+             StreamingFallbackMode.__call__,
+             default=StreamingFallbackMode.WARN,
+         )
+     )
+     max_rows_per_partition: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__MAX_ROWS_PER_PARTITION", int, default=1_000_000
+         )
+     )
+     unique_fraction: dict[str, float] = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__UNIQUE_FRACTION", json.loads, default={}
+         )
+     )
+     target_partition_size: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__TARGET_PARTITION_SIZE", int, default=0
+         )
+     )
+     groupby_n_ary: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__GROUPBY_N_ARY", int, default=32
+         )
+     )
+     broadcast_join_limit: int = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__BROADCAST_JOIN_LIMIT", int, default=0
+         )
+     )
+     shuffle_method: ShuffleMethod = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__SHUFFLE_METHOD",
+             ShuffleMethod.__call__,
+             default=ShuffleMethod.TASKS,
+         )
+     )
+     rapidsmpf_spill: bool = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__RAPIDSMPF_SPILL", _bool_converter, default=False
+         )
+     )
+     sink_to_directory: bool | None = dataclasses.field(
+         default_factory=_make_default_factory(
+             f"{_env_prefix}__SINK_TO_DIRECTORY", _bool_converter, default=None
+         )
+     )
+     stats_planning: StatsPlanningOptions = dataclasses.field(
+         default_factory=StatsPlanningOptions
+     )
+
+     def __post_init__(self) -> None:  # noqa: D105
+         # Handle shuffle_method defaults for streaming executor
+         if self.shuffle_method is None:
+             if self.scheduler == "distributed" and rapidsmpf_distributed_available():
+                 # For distributed scheduler, prefer rapidsmpf if available
+                 object.__setattr__(self, "shuffle_method", "rapidsmpf")
+             else:
+                 # Otherwise, use task-based shuffle for now.
+                 # TODO: Evaluate single-process shuffle by default.
+                 object.__setattr__(self, "shuffle_method", "tasks")
+         elif self.shuffle_method == "rapidsmpf-single":
+             # The user should NOT specify "rapidsmpf-single" directly.
+             raise ValueError("rapidsmpf-single is not a supported shuffle method.")
+         elif self.shuffle_method == "rapidsmpf":
+             # Check that we have rapidsmpf installed
+             if (
+                 self.scheduler == "distributed"
+                 and not rapidsmpf_distributed_available()
+             ):
+                 raise ValueError(
+                     "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed."
+                 )
+             elif self.scheduler == "synchronous" and not rapidsmpf_single_available():
+                 raise ValueError(
+                     "rapidsmpf shuffle method requested, but rapidsmpf is not installed."
+                 )
+             # Select "rapidsmpf-single" for the synchronous scheduler.
+             if self.scheduler == "synchronous":
+                 object.__setattr__(self, "shuffle_method", "rapidsmpf-single")
+
+         # frozen dataclass, so use object.__setattr__
+         object.__setattr__(
+             self, "fallback_mode", StreamingFallbackMode(self.fallback_mode)
+         )
+         if self.target_partition_size == 0:
+             object.__setattr__(
+                 self, "target_partition_size", default_blocksize(self.scheduler)
+             )
+         if self.broadcast_join_limit == 0:
+             object.__setattr__(
+                 self,
+                 "broadcast_join_limit",
+                 # Usually better to avoid shuffling for single gpu
+                 2 if self.scheduler == "distributed" else 32,
+             )
+         object.__setattr__(self, "scheduler", Scheduler(self.scheduler))
+         object.__setattr__(self, "shuffle_method", ShuffleMethod(self.shuffle_method))
+
+         # Make sure stats_planning is a dataclass
+         if isinstance(self.stats_planning, dict):
+             object.__setattr__(
+                 self,
+                 "stats_planning",
+                 StatsPlanningOptions(**self.stats_planning),
+             )
+
+         if self.scheduler == "distributed":
+             if self.sink_to_directory is False:
+                 raise ValueError(
+                     "The distributed scheduler requires sink_to_directory=True"
+                 )
+             object.__setattr__(self, "sink_to_directory", True)
+         elif self.sink_to_directory is None:
+             object.__setattr__(self, "sink_to_directory", False)
+
+         # Type / value check everything else
+         if not isinstance(self.max_rows_per_partition, int):
+             raise TypeError("max_rows_per_partition must be an int")
+         if not isinstance(self.unique_fraction, dict):
+             raise TypeError("unique_fraction must be a dict of column name to float")
+         if not isinstance(self.target_partition_size, int):
+             raise TypeError("target_partition_size must be an int")
+         if not isinstance(self.groupby_n_ary, int):
+             raise TypeError("groupby_n_ary must be an int")
+         if not isinstance(self.broadcast_join_limit, int):
+             raise TypeError("broadcast_join_limit must be an int")
+         if not isinstance(self.rapidsmpf_spill, bool):
+             raise TypeError("rapidsmpf_spill must be bool")
+         if not isinstance(self.sink_to_directory, bool):
+             raise TypeError("sink_to_directory must be bool")
+
+         # RapidsMPF spill is only supported for the distributed scheduler for now.
+         # This is because the spilling API is still within the RMPF-Dask integration.
+         # (See https://github.com/rapidsai/rapidsmpf/issues/439)
+         if self.scheduler == "synchronous" and self.rapidsmpf_spill:  # pragma: no cover
+             raise ValueError(
+                 "rapidsmpf_spill is not supported for the synchronous scheduler."
+             )
+
+     def __hash__(self) -> int:  # noqa: D105
+         # unique_fraction, a dict, isn't natively hashable. We'll dump it
+         # to json and hash that.
+         d = dataclasses.asdict(self)
+         d["unique_fraction"] = json.dumps(d["unique_fraction"])
+         d["stats_planning"] = json.dumps(d["stats_planning"])
+         return hash(tuple(sorted(d.items())))
+
+
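Every field above (other than ``name``) can be supplied through ``GPUEngine``'s ``executor_options``, which ``ConfigOptions.from_polars_engine`` forwards as ``StreamingExecutor(**executor_options)``. A hedged sketch; the column name, sizes, and dataset path are illustrative:

    import polars as pl

    engine = pl.GPUEngine(
        executor="streaming",
        executor_options={
            "scheduler": "synchronous",
            "fallback_mode": "raise",  # error instead of single-partition fallback
            "max_rows_per_partition": 500_000,
            "unique_fraction": {"customer_id": 0.05},  # hypothetical column
            "target_partition_size": 2_000_000_000,
        },
    )

    # q = pl.scan_parquet("data/*.parquet")  # hypothetical dataset
    # result = q.collect(engine=engine)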
+ @dataclasses.dataclass(frozen=True, eq=True)
+ class InMemoryExecutor:
+     """
+     Configuration for the cudf-polars in-memory executor.
+
+     Parameters
+     ----------
+     scheduler:
+         The scheduler to use for the in-memory executor. Currently
+         only ``Scheduler.SYNCHRONOUS`` is supported for the in-memory executor.
+     """
+
+     name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+
+
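For comparison, selecting the in-memory executor needs only the executor name; a minimal sketch:

    import polars as pl

    # Runs the whole query on the GPU without splitting it into partitions.
    engine = pl.GPUEngine(executor="in-memory")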
+ @dataclasses.dataclass(frozen=True, eq=True)
+ class ConfigOptions:
+     """
+     Configuration for the polars GPUEngine.
+
+     Parameters
+     ----------
+     raise_on_fail
+         Whether to raise an exception when the GPU engine cannot execute a
+         query. ``False`` by default.
+     parquet_options
+         Options controlling parquet file reading and writing. See
+         :class:`~cudf_polars.utils.config.ParquetOptions` for more.
+     executor
+         The executor to use for the GPU engine. See :class:`~cudf_polars.utils.config.StreamingExecutor`
+         and :class:`~cudf_polars.utils.config.InMemoryExecutor` for more.
+     device
+         The GPU used to run the query. If not provided, the
+         query uses the current CUDA device.
+     """
+
+     raise_on_fail: bool = False
+     parquet_options: ParquetOptions = dataclasses.field(default_factory=ParquetOptions)
+     executor: StreamingExecutor | InMemoryExecutor = dataclasses.field(
+         default_factory=StreamingExecutor
+     )
+     device: int | None = None
+
+     @classmethod
+     def from_polars_engine(
+         cls, engine: polars.lazyframe.engine_config.GPUEngine
+     ) -> Self:
+         """Create a :class:`ConfigOptions` from a :class:`~polars.lazyframe.engine_config.GPUEngine`."""
+         # these are the valid top-level keys in the engine.config that
+         # the user passes as **kwargs to GPUEngine.
+         valid_options = {
+             "executor",
+             "executor_options",
+             "parquet_options",
+             "raise_on_fail",
+         }
+
+         extra_options = set(engine.config.keys()) - valid_options
+         if extra_options:
+             raise TypeError(f"Unsupported executor_options: {extra_options}")
+
+         env_prefix = "CUDF_POLARS"
+         user_executor = engine.config.get("executor")
+         if user_executor is None:
+             user_executor = os.environ.get(f"{env_prefix}__EXECUTOR", "streaming")
+         user_executor_options = engine.config.get("executor_options", {})
+         user_parquet_options = engine.config.get("parquet_options", {})
+         # This is set in polars, and so can't be overridden by the environment
+         user_raise_on_fail = engine.config.get("raise_on_fail", False)
+
+         # Backward compatibility for "cardinality_factor"
+         # TODO: Remove this in 25.10
+         if "cardinality_factor" in user_executor_options:
+             warnings.warn(
+                 "The 'cardinality_factor' configuration is deprecated. "
+                 "Please use 'unique_fraction' instead.",
+                 FutureWarning,
+                 stacklevel=2,
+             )
+             cardinality_factor = user_executor_options.pop("cardinality_factor")
+             if "unique_fraction" not in user_executor_options:
+                 user_executor_options["unique_fraction"] = cardinality_factor
+
+         # These are user-provided options, so we need to actually validate
+         # them.
+
+         if user_executor not in {"in-memory", "streaming"}:
+             raise ValueError(f"Unknown executor '{user_executor}'")
+
+         if not isinstance(user_raise_on_fail, bool):
+             raise TypeError("GPUEngine option 'raise_on_fail' must be a boolean.")
+
+         executor: InMemoryExecutor | StreamingExecutor
+
+         match user_executor:
+             case "in-memory":
+                 executor = InMemoryExecutor(**user_executor_options)
+             case "streaming":
+                 user_executor_options = user_executor_options.copy()
+                 # Handle the interaction between the default shuffle method, the
+                 # scheduler, and whether rapidsmpf is available.
+                 env_shuffle_method = os.environ.get(
+                     "CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", None
+                 )
+                 if env_shuffle_method is not None:
+                     shuffle_method_default = ShuffleMethod(env_shuffle_method)
+                 else:
+                     shuffle_method_default = None
+
+                 user_executor_options.setdefault(
+                     "shuffle_method", shuffle_method_default
+                 )
+                 executor = StreamingExecutor(**user_executor_options)
+             case _:  # pragma: no cover; Unreachable
+                 raise ValueError(f"Unsupported executor: {user_executor}")
+
+         return cls(
+             raise_on_fail=user_raise_on_fail,
+             parquet_options=ParquetOptions(**user_parquet_options),
+             executor=executor,
+             device=engine.device,
+         )
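Putting the pieces together, the translation from a user-facing ``GPUEngine`` into the frozen dataclasses above can be exercised directly; a hedged sketch (cudf-polars normally performs this step itself when a query is collected):

    import polars as pl

    from cudf_polars.utils.config import ConfigOptions

    engine = pl.GPUEngine(
        executor="streaming",
        executor_options={"scheduler": "synchronous", "fallback_mode": "warn"},
        raise_on_fail=True,
    )

    # Collect the engine's **kwargs into the configuration objects defined above.
    options = ConfigOptions.from_polars_engine(engine)
    print(options.executor.scheduler)       # Scheduler.SYNCHRONOUS
    print(options.executor.fallback_mode)   # StreamingFallbackMode.WARN
    print(options.parquet_options.chunked)  # True unless overridden by the environment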