cudf-polars-cu13 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
"""Spilling in multi-partition Dask execution using RAPIDSMPF."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
8
|
+
|
|
9
|
+
from dask.sizeof import sizeof
|
|
10
|
+
from distributed import get_worker
|
|
11
|
+
from rapidsmpf.buffer.buffer import MemoryType
|
|
12
|
+
from rapidsmpf.integrations.dask.core import get_worker_context
|
|
13
|
+
from rapidsmpf.integrations.dask.spilling import SpillableWrapper
|
|
14
|
+
|
|
15
|
+
from cudf_polars.containers import DataFrame
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from collections.abc import Callable, MutableMapping
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from cudf_polars.utils.config import ConfigOptions
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def wrap_arg(obj: Any) -> Any:
    """
    Wrap a DataFrame so that it can be spilled; pass anything else through.

    Parameters
    ----------
    obj
        Candidate object. Only instances of DataFrame are wrapped.

    Returns
    -------
    ``SpillableWrapper(on_device=obj)`` when obj is a DataFrame, otherwise
    obj itself, unchanged.
    """
    return SpillableWrapper(on_device=obj) if isinstance(obj, DataFrame) else obj
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def unwrap_arg(obj: Any) -> Any:
    """
    Recover the underlying object from a SpillableWrapper.

    Parameters
    ----------
    obj
        The object to be unwrapped. Objects that are not a SpillableWrapper
        are passed through untouched.

    Returns
    -------
    The result of ``obj.unspill()`` if obj is a SpillableWrapper, otherwise
    the original object.
    """
    # Guard clause: anything that is not wrapped needs no work.
    if not isinstance(obj, SpillableWrapper):
        return obj
    return obj.unspill()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def wrap_func_spillable(
    func: Callable,
    *,
    make_func_output_spillable: bool,
    target_partition_size: int,
) -> Callable:
    """
    Build a wrapper around ``func`` that understands spillable arguments.

    Before calling ``func`` the wrapper estimates how much headroom the task
    needs and, if that estimate is large enough, asks the worker's spill
    manager to free it. Every positional argument is then passed through
    ``unwrap_arg`` and forwarded to ``func``.

    Parameters
    ----------
    func
        The function to be wrapped. It is called with positional arguments
        only.
    make_func_output_spillable
        Whether the value returned by ``func`` is passed through ``wrap_arg``
        before being returned.
    target_partition_size
        Target byte size for IO tasks. Used as the headroom estimate when
        none of the arguments is a SpillableWrapper.

    Returns
    -------
    A wrapped function that processes spillable DataFrames.
    """

    def wrapper(*args: Any) -> Any:
        # Estimate the headroom required before executing the task.
        wrapped_inputs = [a for a in args if isinstance(a, SpillableWrapper)]
        if wrapped_inputs:
            # Only inputs currently held in host memory contribute to the
            # estimate.
            headroom = sum(
                sizeof(a._on_host)
                for a in wrapped_inputs
                if a.mem_type() == MemoryType.HOST
            )
        else:
            # No spillable inputs: likely an IO task - assume we need
            # target_partition_size.
            headroom = target_partition_size

        # Don't waste time on smaller data
        if headroom > 128_000_000:
            worker_ctx = get_worker_context(get_worker())
            with worker_ctx.lock:
                worker_ctx.br.spill_manager.spill_to_make_headroom(
                    headroom=headroom
                )

        result: Any = func(*(unwrap_arg(a) for a in args))
        return wrap_arg(result) if make_func_output_spillable else result

    return wrapper
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def wrap_dataframe_in_spillable(
    graph: MutableMapping[Any, Any],
    ignore_key: str | tuple[str, int],
    config_options: ConfigOptions,
) -> MutableMapping[Any, Any]:
    """
    Return a copy of a task graph whose callables handle spillable DataFrames.

    Only supports flat task graphs where each DataFrame can be found in the
    outermost level. Currently, this is true for all cudf-polars task graphs.

    Parameters
    ----------
    graph
        Task graph. Every value must be a tuple.
    ignore_key
        The key whose task output is left unwrapped, typically the key of
        the output node. The task's callables are still wrapped so that
        their inputs are unwrapped.
    config_options
        GPUEngine configuration options. The executor must be "streaming".

    Returns
    -------
    A new task graph with wrapped functions.
    """
    assert config_options.executor.name == "streaming", (
        "'in-memory' executor not supported in 'wrap_dataframe_in_spillable'"
    )
    target_partition_size = config_options.executor.target_partition_size

    wrapped_graph = {}
    for key, task in graph.items():
        assert isinstance(task, tuple)
        entries = []
        for entry in task:
            if callable(entry):
                # Every callable entry of the task tuple is wrapped; all
                # other entries are carried over as they are.
                entry = wrap_func_spillable(
                    entry,
                    make_func_output_spillable=key != ignore_key,
                    target_partition_size=target_partition_size,
                )
            entries.append(entry)
        wrapped_graph[key] = tuple(entries)
    return wrapped_graph
|