cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +60 -15
- cudf_polars/containers/column.py +137 -77
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +256 -114
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +33 -3
- cudf_polars/dsl/expressions/unary.py +126 -64
- cudf_polars/dsl/ir.py +1053 -350
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +307 -107
- cudf_polars/dsl/utils/aggregations.py +43 -30
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +55 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +792 -2
- cudf_polars/experimental/benchmarks/utils.py +596 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +30 -15
- cudf_polars/experimental/groupby.py +25 -4
- cudf_polars/experimental/io.py +156 -124
- cudf_polars/experimental/join.py +53 -23
- cudf_polars/experimental/parallel.py +68 -19
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
- cudf_polars/experimental/rapidsmpf/core.py +488 -0
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
- cudf_polars/experimental/rapidsmpf/io.py +696 -0
- cudf_polars/experimental/rapidsmpf/join.py +322 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
- cudf_polars/experimental/rapidsmpf/union.py +115 -0
- cudf_polars/experimental/rapidsmpf/utils.py +374 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +46 -12
- cudf_polars/experimental/sort.py +100 -26
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +93 -17
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +473 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +5 -4
- cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
- cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
9782a269e689140d2b00b5172a93056bdf19e8c2
|
cudf_polars/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
26.02.000
|
cudf_polars/callback.py
CHANGED
|
@@ -11,6 +11,7 @@ import textwrap
|
|
|
11
11
|
import time
|
|
12
12
|
import warnings
|
|
13
13
|
from functools import cache, partial
|
|
14
|
+
from threading import Lock
|
|
14
15
|
from typing import TYPE_CHECKING, Literal, overload
|
|
15
16
|
|
|
16
17
|
import nvtx
|
|
@@ -22,9 +23,14 @@ import pylibcudf
|
|
|
22
23
|
import rmm
|
|
23
24
|
from rmm._cuda import gpu
|
|
24
25
|
|
|
26
|
+
import cudf_polars.dsl.tracing
|
|
27
|
+
from cudf_polars.dsl.ir import IRExecutionContext
|
|
25
28
|
from cudf_polars.dsl.tracing import CUDF_POLARS_NVTX_DOMAIN
|
|
26
29
|
from cudf_polars.dsl.translate import Translator
|
|
27
|
-
from cudf_polars.utils.config import
|
|
30
|
+
from cudf_polars.utils.config import (
|
|
31
|
+
_env_get_int,
|
|
32
|
+
get_total_device_memory,
|
|
33
|
+
)
|
|
28
34
|
from cudf_polars.utils.timer import Timer
|
|
29
35
|
|
|
30
36
|
if TYPE_CHECKING:
|
|
@@ -35,7 +41,7 @@ if TYPE_CHECKING:
|
|
|
35
41
|
|
|
36
42
|
from cudf_polars.dsl.ir import IR
|
|
37
43
|
from cudf_polars.typing import NodeTraverser
|
|
38
|
-
from cudf_polars.utils.config import ConfigOptions
|
|
44
|
+
from cudf_polars.utils.config import ConfigOptions, MemoryResourceConfig
|
|
39
45
|
|
|
40
46
|
__all__: list[str] = ["execute_with_cudf"]
|
|
41
47
|
|
|
@@ -44,6 +50,7 @@ __all__: list[str] = ["execute_with_cudf"]
|
|
|
44
50
|
def default_memory_resource(
|
|
45
51
|
device: int,
|
|
46
52
|
cuda_managed_memory: bool, # noqa: FBT001
|
|
53
|
+
memory_resource_config: MemoryResourceConfig | None,
|
|
47
54
|
) -> rmm.mr.DeviceMemoryResource:
|
|
48
55
|
"""
|
|
49
56
|
Return the default memory resource for cudf-polars.
|
|
@@ -55,6 +62,9 @@ def default_memory_resource(
|
|
|
55
62
|
the active device when this function is called.
|
|
56
63
|
cuda_managed_memory
|
|
57
64
|
Whether to use managed memory or not.
|
|
65
|
+
memory_resource_config
|
|
66
|
+
Memory resource configuration to use. If ``None``, the default
|
|
67
|
+
memory resource is used.
|
|
58
68
|
|
|
59
69
|
Returns
|
|
60
70
|
-------
|
|
@@ -64,7 +74,9 @@ def default_memory_resource(
|
|
|
64
74
|
else, an async pool resource is returned.
|
|
65
75
|
"""
|
|
66
76
|
try:
|
|
67
|
-
if
|
|
77
|
+
if memory_resource_config is not None:
|
|
78
|
+
mr = memory_resource_config.create_memory_resource()
|
|
79
|
+
elif (
|
|
68
80
|
cuda_managed_memory
|
|
69
81
|
and pylibcudf.utils._is_concurrent_managed_access_supported()
|
|
70
82
|
):
|
|
@@ -89,7 +101,7 @@ def default_memory_resource(
|
|
|
89
101
|
):
|
|
90
102
|
raise ComputeError(
|
|
91
103
|
"GPU engine requested, but incorrect cudf-polars package installed. "
|
|
92
|
-
"cudf-polars requires CUDA 12.
|
|
104
|
+
"cudf-polars requires CUDA 12.2+ to installed."
|
|
93
105
|
) from None
|
|
94
106
|
else:
|
|
95
107
|
raise
|
|
@@ -100,6 +112,7 @@ def default_memory_resource(
|
|
|
100
112
|
@contextlib.contextmanager
|
|
101
113
|
def set_memory_resource(
|
|
102
114
|
mr: rmm.mr.DeviceMemoryResource | None,
|
|
115
|
+
memory_resource_config: MemoryResourceConfig | None,
|
|
103
116
|
) -> Generator[rmm.mr.DeviceMemoryResource, None, None]:
|
|
104
117
|
"""
|
|
105
118
|
Set the current memory resource for an execution block.
|
|
@@ -109,6 +122,9 @@ def set_memory_resource(
|
|
|
109
122
|
mr
|
|
110
123
|
Memory resource to use. If `None`, calls :func:`default_memory_resource`
|
|
111
124
|
to obtain an mr on the currently active device.
|
|
125
|
+
memory_resource_config
|
|
126
|
+
Memory resource configuration to use when a concrete memory resource.
|
|
127
|
+
is not provided. If ``None``, the default memory resource is used.
|
|
112
128
|
|
|
113
129
|
Returns
|
|
114
130
|
-------
|
|
@@ -132,7 +148,14 @@ def set_memory_resource(
|
|
|
132
148
|
)
|
|
133
149
|
!= 0
|
|
134
150
|
),
|
|
151
|
+
memory_resource_config=memory_resource_config,
|
|
135
152
|
)
|
|
153
|
+
|
|
154
|
+
if (
|
|
155
|
+
cudf_polars.dsl.tracing.LOG_TRACES
|
|
156
|
+
): # pragma: no cover; requires CUDF_POLARS_LOG_TRACES=1
|
|
157
|
+
mr = rmm.mr.StatisticsResourceAdaptor(mr)
|
|
158
|
+
|
|
136
159
|
rmm.mr.set_current_device_resource(mr)
|
|
137
160
|
try:
|
|
138
161
|
yield mr
|
|
@@ -140,6 +163,11 @@ def set_memory_resource(
|
|
|
140
163
|
rmm.mr.set_current_device_resource(previous)
|
|
141
164
|
|
|
142
165
|
|
|
166
|
+
# libcudf doesn't support executing on multiple devices from within the same process.
|
|
167
|
+
SEEN_DEVICE = None
|
|
168
|
+
SEEN_DEVICE_LOCK = Lock()
|
|
169
|
+
|
|
170
|
+
|
|
143
171
|
@contextlib.contextmanager
|
|
144
172
|
def set_device(device: int | None) -> Generator[int, None, None]:
|
|
145
173
|
"""
|
|
@@ -158,13 +186,28 @@ def set_device(device: int | None) -> Generator[int, None, None]:
|
|
|
158
186
|
-----
|
|
159
187
|
At exit, the device is restored to whatever was current at entry.
|
|
160
188
|
"""
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
189
|
+
global SEEN_DEVICE # noqa: PLW0603
|
|
190
|
+
current: int = gpu.getDevice()
|
|
191
|
+
to_use = device if device is not None else current
|
|
192
|
+
with SEEN_DEVICE_LOCK:
|
|
193
|
+
if (
|
|
194
|
+
SEEN_DEVICE is not None and to_use != SEEN_DEVICE
|
|
195
|
+
): # pragma: no cover; requires multiple GPUs in CI
|
|
196
|
+
raise RuntimeError(
|
|
197
|
+
"cudf-polars does not support running queries on "
|
|
198
|
+
"multiple devices in the same process. "
|
|
199
|
+
f"A previous query used device-{SEEN_DEVICE}, "
|
|
200
|
+
f"the current query is using device-{to_use}."
|
|
201
|
+
)
|
|
202
|
+
SEEN_DEVICE = to_use
|
|
203
|
+
if to_use != current:
|
|
204
|
+
gpu.setDevice(to_use)
|
|
205
|
+
try:
|
|
206
|
+
yield to_use
|
|
207
|
+
finally:
|
|
208
|
+
gpu.setDevice(current)
|
|
209
|
+
else:
|
|
210
|
+
yield to_use
|
|
168
211
|
|
|
169
212
|
|
|
170
213
|
@overload
|
|
@@ -211,14 +254,16 @@ def _callback(
|
|
|
211
254
|
assert n_rows is None
|
|
212
255
|
if timer is not None:
|
|
213
256
|
assert should_time
|
|
257
|
+
|
|
214
258
|
with (
|
|
215
259
|
nvtx.annotate(message="ExecuteIR", domain=CUDF_POLARS_NVTX_DOMAIN),
|
|
216
260
|
# Device must be set before memory resource is obtained.
|
|
217
261
|
set_device(config_options.device),
|
|
218
|
-
set_memory_resource(memory_resource),
|
|
262
|
+
set_memory_resource(memory_resource, config_options.memory_resource_config),
|
|
219
263
|
):
|
|
220
264
|
if config_options.executor.name == "in-memory":
|
|
221
|
-
|
|
265
|
+
context = IRExecutionContext.from_config_options(config_options)
|
|
266
|
+
df = ir.evaluate(cache={}, timer=timer, context=context).to_polars()
|
|
222
267
|
if timer is None:
|
|
223
268
|
return df
|
|
224
269
|
else:
|
|
@@ -236,7 +281,7 @@ def _callback(
|
|
|
236
281
|
""")
|
|
237
282
|
raise NotImplementedError(msg)
|
|
238
283
|
|
|
239
|
-
return evaluate_streaming(ir, config_options)
|
|
284
|
+
return evaluate_streaming(ir, config_options)
|
|
240
285
|
assert_never(f"Unknown executor '{config_options.executor}'")
|
|
241
286
|
|
|
242
287
|
|
|
@@ -287,7 +332,7 @@ def execute_with_cudf(
|
|
|
287
332
|
if (
|
|
288
333
|
memory_resource is None
|
|
289
334
|
and translator.config_options.executor.name == "streaming"
|
|
290
|
-
and translator.config_options.executor.
|
|
335
|
+
and translator.config_options.executor.cluster == "distributed"
|
|
291
336
|
): # pragma: no cover; Requires distributed cluster
|
|
292
337
|
memory_resource = rmm.mr.get_current_device_resource()
|
|
293
338
|
if len(ir_translation_errors):
|
cudf_polars/containers/column.py
CHANGED
|
@@ -5,11 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
import functools
|
|
9
8
|
from typing import TYPE_CHECKING
|
|
10
9
|
|
|
11
|
-
import polars as pl
|
|
12
|
-
import polars.datatypes.convert
|
|
13
10
|
from polars.exceptions import InvalidOperationError
|
|
14
11
|
|
|
15
12
|
import pylibcudf as plc
|
|
@@ -19,15 +16,21 @@ from pylibcudf.strings.convert.convert_integers import (
|
|
|
19
16
|
is_integer,
|
|
20
17
|
to_integers,
|
|
21
18
|
)
|
|
22
|
-
from pylibcudf.traits import is_floating_point
|
|
23
19
|
|
|
24
20
|
from cudf_polars.containers import DataType
|
|
21
|
+
from cudf_polars.containers.datatype import _dtype_from_header, _dtype_to_header
|
|
25
22
|
from cudf_polars.utils import conversion
|
|
26
23
|
from cudf_polars.utils.dtypes import is_order_preserving_cast
|
|
27
24
|
|
|
28
25
|
if TYPE_CHECKING:
|
|
26
|
+
from collections.abc import Callable
|
|
27
|
+
|
|
29
28
|
from typing_extensions import Self
|
|
30
29
|
|
|
30
|
+
from polars import Series as pl_Series
|
|
31
|
+
|
|
32
|
+
from rmm.pylibrmm.stream import Stream
|
|
33
|
+
|
|
31
34
|
from cudf_polars.typing import (
|
|
32
35
|
ColumnHeader,
|
|
33
36
|
ColumnOptions,
|
|
@@ -38,22 +41,6 @@ if TYPE_CHECKING:
|
|
|
38
41
|
__all__: list[str] = ["Column"]
|
|
39
42
|
|
|
40
43
|
|
|
41
|
-
def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
|
|
42
|
-
"""Convert a Polars dtype short repr to a Polars dtype."""
|
|
43
|
-
# limitations of dtype_short_repr_to_dtype described in
|
|
44
|
-
# py-polars/polars/datatypes/convert.py#L299
|
|
45
|
-
if dtype_str.startswith("list["):
|
|
46
|
-
stripped = dtype_str.removeprefix("list[").removesuffix("]")
|
|
47
|
-
return pl.List(_dtype_short_repr_to_dtype(stripped))
|
|
48
|
-
pl_type = polars.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
|
|
49
|
-
if pl_type is None:
|
|
50
|
-
raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")
|
|
51
|
-
if isinstance(pl_type, polars.datatypes.DataTypeClass):
|
|
52
|
-
return pl_type()
|
|
53
|
-
else:
|
|
54
|
-
return pl_type
|
|
55
|
-
|
|
56
|
-
|
|
57
44
|
class Column:
|
|
58
45
|
"""An immutable column with sortedness metadata."""
|
|
59
46
|
|
|
@@ -85,7 +72,10 @@ class Column:
|
|
|
85
72
|
|
|
86
73
|
@classmethod
|
|
87
74
|
def deserialize(
|
|
88
|
-
cls,
|
|
75
|
+
cls,
|
|
76
|
+
header: ColumnHeader,
|
|
77
|
+
frames: tuple[memoryview[bytes], plc.gpumemoryview],
|
|
78
|
+
stream: Stream,
|
|
89
79
|
) -> Self:
|
|
90
80
|
"""
|
|
91
81
|
Create a Column from a serialized representation returned by `.serialize()`.
|
|
@@ -96,6 +86,10 @@ class Column:
|
|
|
96
86
|
The (unpickled) metadata required to reconstruct the object.
|
|
97
87
|
frames
|
|
98
88
|
Two-tuple of frames (a memoryview and a gpumemoryview).
|
|
89
|
+
stream
|
|
90
|
+
CUDA stream used for device memory operations and kernel launches
|
|
91
|
+
on this column. The caller is responsible for ensuring that
|
|
92
|
+
the data in ``frames`` is valid on ``stream``.
|
|
99
93
|
|
|
100
94
|
Returns
|
|
101
95
|
-------
|
|
@@ -104,7 +98,7 @@ class Column:
|
|
|
104
98
|
"""
|
|
105
99
|
packed_metadata, packed_gpu_data = frames
|
|
106
100
|
(plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
|
|
107
|
-
packed_metadata, packed_gpu_data
|
|
101
|
+
packed_metadata, packed_gpu_data, stream
|
|
108
102
|
).columns()
|
|
109
103
|
return cls(plc_column, **cls.deserialize_ctor_kwargs(header["column_kwargs"]))
|
|
110
104
|
|
|
@@ -113,20 +107,18 @@ class Column:
|
|
|
113
107
|
column_kwargs: ColumnOptions,
|
|
114
108
|
) -> DeserializedColumnOptions:
|
|
115
109
|
"""Deserialize the constructor kwargs for a Column."""
|
|
116
|
-
dtype = DataType( # pragma: no cover
|
|
117
|
-
_dtype_short_repr_to_dtype(column_kwargs["dtype"])
|
|
118
|
-
)
|
|
119
110
|
return {
|
|
120
111
|
"is_sorted": column_kwargs["is_sorted"],
|
|
121
112
|
"order": column_kwargs["order"],
|
|
122
113
|
"null_order": column_kwargs["null_order"],
|
|
123
114
|
"name": column_kwargs["name"],
|
|
124
|
-
"dtype": dtype,
|
|
115
|
+
"dtype": DataType(_dtype_from_header(column_kwargs["dtype"])),
|
|
125
116
|
}
|
|
126
117
|
|
|
127
118
|
def serialize(
|
|
128
119
|
self,
|
|
129
|
-
|
|
120
|
+
stream: Stream,
|
|
121
|
+
) -> tuple[ColumnHeader, tuple[memoryview[bytes], plc.gpumemoryview]]:
|
|
130
122
|
"""
|
|
131
123
|
Serialize the Column into header and frames.
|
|
132
124
|
|
|
@@ -145,7 +137,7 @@ class Column:
|
|
|
145
137
|
frames
|
|
146
138
|
Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
|
|
147
139
|
"""
|
|
148
|
-
packed = plc.contiguous_split.pack(plc.Table([self.obj]))
|
|
140
|
+
packed = plc.contiguous_split.pack(plc.Table([self.obj]), stream=stream)
|
|
149
141
|
header: ColumnHeader = {
|
|
150
142
|
"column_kwargs": self.serialize_ctor_kwargs(),
|
|
151
143
|
"frame_count": 2,
|
|
@@ -159,14 +151,20 @@ class Column:
|
|
|
159
151
|
"order": self.order,
|
|
160
152
|
"null_order": self.null_order,
|
|
161
153
|
"name": self.name,
|
|
162
|
-
"dtype":
|
|
154
|
+
"dtype": _dtype_to_header(self.dtype.polars_type),
|
|
163
155
|
}
|
|
164
156
|
|
|
165
|
-
|
|
166
|
-
def obj_scalar(self) -> plc.Scalar:
|
|
157
|
+
def obj_scalar(self, stream: Stream) -> plc.Scalar:
|
|
167
158
|
"""
|
|
168
159
|
A copy of the column object as a pylibcudf Scalar.
|
|
169
160
|
|
|
161
|
+
Parameters
|
|
162
|
+
----------
|
|
163
|
+
stream
|
|
164
|
+
CUDA stream used for device memory operations and kernel launches.
|
|
165
|
+
``self.obj`` must be valid on this stream, and the result will be
|
|
166
|
+
valid on this stream.
|
|
167
|
+
|
|
170
168
|
Returns
|
|
171
169
|
-------
|
|
172
170
|
pylibcudf Scalar object.
|
|
@@ -178,7 +176,7 @@ class Column:
|
|
|
178
176
|
"""
|
|
179
177
|
if not self.is_scalar:
|
|
180
178
|
raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
|
|
181
|
-
return plc.copying.get_element(self.obj, 0)
|
|
179
|
+
return plc.copying.get_element(self.obj, 0, stream=stream)
|
|
182
180
|
|
|
183
181
|
def rename(self, name: str | None, /) -> Self:
|
|
184
182
|
"""
|
|
@@ -228,6 +226,7 @@ class Column:
|
|
|
228
226
|
*,
|
|
229
227
|
order: plc.types.Order,
|
|
230
228
|
null_order: plc.types.NullOrder,
|
|
229
|
+
stream: Stream,
|
|
231
230
|
) -> bool:
|
|
232
231
|
"""
|
|
233
232
|
Check if the column is sorted.
|
|
@@ -238,6 +237,9 @@ class Column:
|
|
|
238
237
|
The requested sort order.
|
|
239
238
|
null_order
|
|
240
239
|
Where nulls sort to.
|
|
240
|
+
stream
|
|
241
|
+
CUDA stream used for device memory operations and kernel launches
|
|
242
|
+
on this Column. The data in ``self.obj`` must be valid on this stream.
|
|
241
243
|
|
|
242
244
|
Returns
|
|
243
245
|
-------
|
|
@@ -254,14 +256,16 @@ class Column:
|
|
|
254
256
|
return self.order == order and (
|
|
255
257
|
self.null_count == 0 or self.null_order == null_order
|
|
256
258
|
)
|
|
257
|
-
if plc.sorting.is_sorted(
|
|
259
|
+
if plc.sorting.is_sorted(
|
|
260
|
+
plc.Table([self.obj]), [order], [null_order], stream=stream
|
|
261
|
+
):
|
|
258
262
|
self.sorted = plc.types.Sorted.YES
|
|
259
263
|
self.order = order
|
|
260
264
|
self.null_order = null_order
|
|
261
265
|
return True
|
|
262
266
|
return False
|
|
263
267
|
|
|
264
|
-
def astype(self, dtype: DataType) -> Column:
|
|
268
|
+
def astype(self, dtype: DataType, stream: Stream, *, strict: bool = True) -> Column:
|
|
265
269
|
"""
|
|
266
270
|
Cast the column to as the requested dtype.
|
|
267
271
|
|
|
@@ -269,6 +273,12 @@ class Column:
|
|
|
269
273
|
----------
|
|
270
274
|
dtype
|
|
271
275
|
Datatype to cast to.
|
|
276
|
+
stream
|
|
277
|
+
CUDA stream used for device memory operations and kernel launches
|
|
278
|
+
on this Column. The data in ``self.obj`` must be valid on this stream.
|
|
279
|
+
strict
|
|
280
|
+
If True, raise an error if the cast is unsupported.
|
|
281
|
+
If False, return nulls for unsupported casts.
|
|
272
282
|
|
|
273
283
|
Returns
|
|
274
284
|
-------
|
|
@@ -284,7 +294,7 @@ class Column:
|
|
|
284
294
|
This only produces a copy if the requested dtype doesn't match
|
|
285
295
|
the current one.
|
|
286
296
|
"""
|
|
287
|
-
plc_dtype = dtype.
|
|
297
|
+
plc_dtype = dtype.plc_type
|
|
288
298
|
if self.obj.type() == plc_dtype:
|
|
289
299
|
return self
|
|
290
300
|
|
|
@@ -292,12 +302,17 @@ class Column:
|
|
|
292
302
|
plc_dtype.id() == plc.TypeId.STRING
|
|
293
303
|
or self.obj.type().id() == plc.TypeId.STRING
|
|
294
304
|
):
|
|
295
|
-
return Column(
|
|
305
|
+
return Column(
|
|
306
|
+
self._handle_string_cast(plc_dtype, stream=stream, strict=strict),
|
|
307
|
+
dtype=dtype,
|
|
308
|
+
)
|
|
296
309
|
elif plc.traits.is_integral_not_bool(
|
|
297
310
|
self.obj.type()
|
|
298
311
|
) and plc.traits.is_timestamp(plc_dtype):
|
|
299
|
-
upcasted = plc.unary.cast(
|
|
300
|
-
|
|
312
|
+
upcasted = plc.unary.cast(
|
|
313
|
+
self.obj, plc.DataType(plc.TypeId.INT64), stream=stream
|
|
314
|
+
)
|
|
315
|
+
plc_col = plc.column.Column(
|
|
301
316
|
plc_dtype,
|
|
302
317
|
upcasted.size(),
|
|
303
318
|
upcasted.data(),
|
|
@@ -306,11 +321,11 @@ class Column:
|
|
|
306
321
|
upcasted.offset(),
|
|
307
322
|
upcasted.children(),
|
|
308
323
|
)
|
|
309
|
-
return Column(
|
|
324
|
+
return Column(plc_col, dtype=dtype).sorted_like(self)
|
|
310
325
|
elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp(
|
|
311
326
|
self.obj.type()
|
|
312
327
|
):
|
|
313
|
-
|
|
328
|
+
plc_col = plc.column.Column(
|
|
314
329
|
plc.DataType(plc.TypeId.INT64),
|
|
315
330
|
self.obj.size(),
|
|
316
331
|
self.obj.data(),
|
|
@@ -319,42 +334,66 @@ class Column:
|
|
|
319
334
|
self.obj.offset(),
|
|
320
335
|
self.obj.children(),
|
|
321
336
|
)
|
|
322
|
-
return Column(
|
|
323
|
-
|
|
324
|
-
)
|
|
337
|
+
return Column(
|
|
338
|
+
plc.unary.cast(plc_col, plc_dtype, stream=stream), dtype=dtype
|
|
339
|
+
).sorted_like(self)
|
|
325
340
|
else:
|
|
326
|
-
result = Column(
|
|
341
|
+
result = Column(
|
|
342
|
+
plc.unary.cast(self.obj, plc_dtype, stream=stream), dtype=dtype
|
|
343
|
+
)
|
|
327
344
|
if is_order_preserving_cast(self.obj.type(), plc_dtype):
|
|
328
345
|
return result.sorted_like(self)
|
|
329
346
|
return result
|
|
330
347
|
|
|
331
|
-
def _handle_string_cast(
|
|
348
|
+
def _handle_string_cast(
|
|
349
|
+
self, dtype: plc.DataType, stream: Stream, *, strict: bool
|
|
350
|
+
) -> plc.Column:
|
|
332
351
|
if dtype.id() == plc.TypeId.STRING:
|
|
333
|
-
if is_floating_point(self.obj.type()):
|
|
334
|
-
return from_floats(self.obj)
|
|
352
|
+
if plc.traits.is_floating_point(self.obj.type()):
|
|
353
|
+
return from_floats(self.obj, stream=stream)
|
|
354
|
+
elif plc.traits.is_integral_not_bool(self.obj.type()):
|
|
355
|
+
return from_integers(self.obj, stream=stream)
|
|
335
356
|
else:
|
|
336
|
-
|
|
357
|
+
raise InvalidOperationError(
|
|
358
|
+
f"Unsupported casting from {self.dtype.id()} to {dtype.id()}."
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
type_checker: Callable[[plc.Column, Stream], plc.Column]
|
|
362
|
+
type_caster: Callable[[plc.Column, plc.DataType, Stream], plc.Column]
|
|
363
|
+
if plc.traits.is_floating_point(dtype):
|
|
364
|
+
type_checker = is_float
|
|
365
|
+
type_caster = to_floats
|
|
366
|
+
elif plc.traits.is_integral_not_bool(dtype):
|
|
367
|
+
# is_integer has a second optional int_type: plc.DataType | None = None argument
|
|
368
|
+
# we do not use
|
|
369
|
+
# unused-ignore for if RMM is missing
|
|
370
|
+
type_checker = is_integer # type: ignore[assignment,unused-ignore]
|
|
371
|
+
type_caster = to_integers
|
|
337
372
|
else:
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
373
|
+
raise InvalidOperationError(
|
|
374
|
+
f"Unsupported casting from {self.dtype.id()} to {dtype.id()}."
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
castable = type_checker(self.obj, stream=stream) # type: ignore[call-arg]
|
|
378
|
+
if not plc.reduce.reduce(
|
|
379
|
+
castable,
|
|
380
|
+
plc.aggregation.all(),
|
|
381
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
382
|
+
stream=stream,
|
|
383
|
+
).to_py(stream=stream):
|
|
384
|
+
if strict:
|
|
385
|
+
raise InvalidOperationError(
|
|
386
|
+
f"Conversion from {self.dtype.id()} to {dtype.id()} failed."
|
|
387
|
+
)
|
|
347
388
|
else:
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
def copy_metadata(self, from_: pl.Series, /) -> Self:
|
|
389
|
+
values = self.obj.with_mask(
|
|
390
|
+
*plc.transform.bools_to_mask(castable, stream=stream)
|
|
391
|
+
)
|
|
392
|
+
else:
|
|
393
|
+
values = self.obj
|
|
394
|
+
return type_caster(values, dtype, stream=stream)
|
|
395
|
+
|
|
396
|
+
def copy_metadata(self, from_: pl_Series, /) -> Self:
|
|
358
397
|
"""
|
|
359
398
|
Copy metadata from a host series onto self.
|
|
360
399
|
|
|
@@ -439,27 +478,44 @@ class Column:
|
|
|
439
478
|
dtype=self.dtype,
|
|
440
479
|
)
|
|
441
480
|
|
|
442
|
-
def mask_nans(self) -> Self:
|
|
481
|
+
def mask_nans(self, stream: Stream) -> Self:
|
|
443
482
|
"""Return a shallow copy of self with nans masked out."""
|
|
444
483
|
if plc.traits.is_floating_point(self.obj.type()):
|
|
445
484
|
old_count = self.null_count
|
|
446
|
-
mask, new_count = plc.transform.nans_to_nulls(self.obj)
|
|
485
|
+
mask, new_count = plc.transform.nans_to_nulls(self.obj, stream=stream)
|
|
447
486
|
result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
|
|
448
487
|
if old_count == new_count:
|
|
449
488
|
return result.sorted_like(self)
|
|
450
489
|
return result
|
|
451
490
|
return self.copy()
|
|
452
491
|
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
492
|
+
def nan_count(self, stream: Stream) -> int:
|
|
493
|
+
"""
|
|
494
|
+
Return the number of NaN values in the column.
|
|
495
|
+
|
|
496
|
+
Parameters
|
|
497
|
+
----------
|
|
498
|
+
stream
|
|
499
|
+
CUDA stream used for device memory operations and kernel launches.
|
|
500
|
+
``self.obj`` must be valid on this stream, and the result will be
|
|
501
|
+
valid on this stream.
|
|
502
|
+
|
|
503
|
+
Returns
|
|
504
|
+
-------
|
|
505
|
+
Number of NaN values in the column.
|
|
506
|
+
"""
|
|
507
|
+
result: int
|
|
456
508
|
if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
|
|
457
|
-
|
|
458
|
-
|
|
509
|
+
# See https://github.com/rapidsai/cudf/issues/20202 for we type ignore
|
|
510
|
+
result = plc.reduce.reduce( # type: ignore[assignment]
|
|
511
|
+
plc.unary.is_nan(self.obj, stream=stream),
|
|
459
512
|
plc.aggregation.sum(),
|
|
460
513
|
plc.types.SIZE_TYPE,
|
|
461
|
-
|
|
462
|
-
|
|
514
|
+
stream=stream,
|
|
515
|
+
).to_py(stream=stream)
|
|
516
|
+
else:
|
|
517
|
+
result = 0
|
|
518
|
+
return result
|
|
463
519
|
|
|
464
520
|
@property
|
|
465
521
|
def size(self) -> int:
|
|
@@ -471,7 +527,7 @@ class Column:
|
|
|
471
527
|
"""Return the number of Null values in the column."""
|
|
472
528
|
return self.obj.null_count()
|
|
473
529
|
|
|
474
|
-
def slice(self, zlice: Slice | None) -> Self:
|
|
530
|
+
def slice(self, zlice: Slice | None, stream: Stream) -> Self:
|
|
475
531
|
"""
|
|
476
532
|
Slice a column.
|
|
477
533
|
|
|
@@ -480,6 +536,9 @@ class Column:
|
|
|
480
536
|
zlice
|
|
481
537
|
optional, tuple of start and length, negative values of start
|
|
482
538
|
treated as for python indexing. If not provided, returns self.
|
|
539
|
+
stream
|
|
540
|
+
CUDA stream used for device memory operations and kernel launches
|
|
541
|
+
on this Column. The data in ``self.obj`` must be valid on this stream.
|
|
483
542
|
|
|
484
543
|
Returns
|
|
485
544
|
-------
|
|
@@ -490,6 +549,7 @@ class Column:
|
|
|
490
549
|
(table,) = plc.copying.slice(
|
|
491
550
|
plc.Table([self.obj]),
|
|
492
551
|
conversion.from_polars_slice(zlice, num_rows=self.size),
|
|
552
|
+
stream=stream,
|
|
493
553
|
)
|
|
494
554
|
(column,) = table.columns()
|
|
495
555
|
return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)
|