cudf-polars-cu12 25.10.0__py3-none-any.whl → 25.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +32 -8
- cudf_polars/containers/column.py +94 -59
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +235 -102
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +9 -3
- cudf_polars/dsl/expressions/unary.py +117 -58
- cudf_polars/dsl/ir.py +923 -290
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +294 -97
- cudf_polars/dsl/utils/aggregations.py +34 -26
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +45 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +791 -1
- cudf_polars/experimental/benchmarks/utils.py +515 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +22 -10
- cudf_polars/experimental/groupby.py +23 -4
- cudf_polars/experimental/io.py +93 -83
- cudf_polars/experimental/join.py +39 -22
- cudf_polars/experimental/parallel.py +60 -14
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/core.py +361 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +150 -0
- cudf_polars/experimental/rapidsmpf/io.py +604 -0
- cudf_polars/experimental/rapidsmpf/join.py +237 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +494 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +151 -0
- cudf_polars/experimental/rapidsmpf/shuffle.py +277 -0
- cudf_polars/experimental/rapidsmpf/union.py +96 -0
- cudf_polars/experimental/rapidsmpf/utils.py +162 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +28 -8
- cudf_polars/experimental/sort.py +92 -25
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +88 -15
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +406 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +3 -2
- cudf_polars_cu12-25.12.0.dist-info/METADATA +182 -0
- cudf_polars_cu12-25.12.0.dist-info/RECORD +104 -0
- cudf_polars_cu12-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu12-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu12-25.10.0.dist-info → cudf_polars_cu12-25.12.0.dist-info}/WHEEL +0 -0
- {cudf_polars_cu12-25.10.0.dist-info → cudf_polars_cu12-25.12.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.10.0.dist-info → cudf_polars_cu12-25.12.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
580975be72b3516c2c18da149b62de557b28fb67
|
cudf_polars/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
25.
|
|
1
|
+
25.12.00
|
cudf_polars/callback.py
CHANGED
|
@@ -22,9 +22,14 @@ import pylibcudf
|
|
|
22
22
|
import rmm
|
|
23
23
|
from rmm._cuda import gpu
|
|
24
24
|
|
|
25
|
+
import cudf_polars.dsl.tracing
|
|
26
|
+
from cudf_polars.dsl.ir import IRExecutionContext
|
|
25
27
|
from cudf_polars.dsl.tracing import CUDF_POLARS_NVTX_DOMAIN
|
|
26
28
|
from cudf_polars.dsl.translate import Translator
|
|
27
|
-
from cudf_polars.utils.config import
|
|
29
|
+
from cudf_polars.utils.config import (
|
|
30
|
+
_env_get_int,
|
|
31
|
+
get_total_device_memory,
|
|
32
|
+
)
|
|
28
33
|
from cudf_polars.utils.timer import Timer
|
|
29
34
|
|
|
30
35
|
if TYPE_CHECKING:
|
|
@@ -35,7 +40,7 @@ if TYPE_CHECKING:
|
|
|
35
40
|
|
|
36
41
|
from cudf_polars.dsl.ir import IR
|
|
37
42
|
from cudf_polars.typing import NodeTraverser
|
|
38
|
-
from cudf_polars.utils.config import ConfigOptions
|
|
43
|
+
from cudf_polars.utils.config import ConfigOptions, MemoryResourceConfig
|
|
39
44
|
|
|
40
45
|
__all__: list[str] = ["execute_with_cudf"]
|
|
41
46
|
|
|
@@ -44,6 +49,7 @@ __all__: list[str] = ["execute_with_cudf"]
|
|
|
44
49
|
def default_memory_resource(
|
|
45
50
|
device: int,
|
|
46
51
|
cuda_managed_memory: bool, # noqa: FBT001
|
|
52
|
+
memory_resource_config: MemoryResourceConfig | None,
|
|
47
53
|
) -> rmm.mr.DeviceMemoryResource:
|
|
48
54
|
"""
|
|
49
55
|
Return the default memory resource for cudf-polars.
|
|
@@ -55,6 +61,9 @@ def default_memory_resource(
|
|
|
55
61
|
the active device when this function is called.
|
|
56
62
|
cuda_managed_memory
|
|
57
63
|
Whether to use managed memory or not.
|
|
64
|
+
memory_resource_config
|
|
65
|
+
Memory resource configuration to use. If ``None``, the default
|
|
66
|
+
memory resource is used.
|
|
58
67
|
|
|
59
68
|
Returns
|
|
60
69
|
-------
|
|
@@ -64,7 +73,9 @@ def default_memory_resource(
|
|
|
64
73
|
else, an async pool resource is returned.
|
|
65
74
|
"""
|
|
66
75
|
try:
|
|
67
|
-
if
|
|
76
|
+
if memory_resource_config is not None:
|
|
77
|
+
mr = memory_resource_config.create_memory_resource()
|
|
78
|
+
elif (
|
|
68
79
|
cuda_managed_memory
|
|
69
80
|
and pylibcudf.utils._is_concurrent_managed_access_supported()
|
|
70
81
|
):
|
|
@@ -89,7 +100,7 @@ def default_memory_resource(
|
|
|
89
100
|
):
|
|
90
101
|
raise ComputeError(
|
|
91
102
|
"GPU engine requested, but incorrect cudf-polars package installed. "
|
|
92
|
-
"cudf-polars requires CUDA 12.
|
|
103
|
+
"cudf-polars requires CUDA 12.2+ to installed."
|
|
93
104
|
) from None
|
|
94
105
|
else:
|
|
95
106
|
raise
|
|
@@ -100,6 +111,7 @@ def default_memory_resource(
|
|
|
100
111
|
@contextlib.contextmanager
|
|
101
112
|
def set_memory_resource(
|
|
102
113
|
mr: rmm.mr.DeviceMemoryResource | None,
|
|
114
|
+
memory_resource_config: MemoryResourceConfig | None,
|
|
103
115
|
) -> Generator[rmm.mr.DeviceMemoryResource, None, None]:
|
|
104
116
|
"""
|
|
105
117
|
Set the current memory resource for an execution block.
|
|
@@ -109,6 +121,9 @@ def set_memory_resource(
|
|
|
109
121
|
mr
|
|
110
122
|
Memory resource to use. If `None`, calls :func:`default_memory_resource`
|
|
111
123
|
to obtain an mr on the currently active device.
|
|
124
|
+
memory_resource_config
|
|
125
|
+
Memory resource configuration to use when a concrete memory resource
|
|
126
|
+
is not provided. If ``None``, the default memory resource is used.
|
|
112
127
|
|
|
113
128
|
Returns
|
|
114
129
|
-------
|
|
@@ -132,7 +147,14 @@ def set_memory_resource(
|
|
|
132
147
|
)
|
|
133
148
|
!= 0
|
|
134
149
|
),
|
|
150
|
+
memory_resource_config=memory_resource_config,
|
|
135
151
|
)
|
|
152
|
+
|
|
153
|
+
if (
|
|
154
|
+
cudf_polars.dsl.tracing.LOG_TRACES
|
|
155
|
+
): # pragma: no cover; requires CUDF_POLARS_LOG_TRACES=1
|
|
156
|
+
mr = rmm.mr.StatisticsResourceAdaptor(mr)
|
|
157
|
+
|
|
136
158
|
rmm.mr.set_current_device_resource(mr)
|
|
137
159
|
try:
|
|
138
160
|
yield mr
|
|
@@ -211,14 +233,16 @@ def _callback(
|
|
|
211
233
|
assert n_rows is None
|
|
212
234
|
if timer is not None:
|
|
213
235
|
assert should_time
|
|
236
|
+
|
|
214
237
|
with (
|
|
215
238
|
nvtx.annotate(message="ExecuteIR", domain=CUDF_POLARS_NVTX_DOMAIN),
|
|
216
239
|
# Device must be set before memory resource is obtained.
|
|
217
240
|
set_device(config_options.device),
|
|
218
|
-
set_memory_resource(memory_resource),
|
|
241
|
+
set_memory_resource(memory_resource, config_options.memory_resource_config),
|
|
219
242
|
):
|
|
220
243
|
if config_options.executor.name == "in-memory":
|
|
221
|
-
|
|
244
|
+
context = IRExecutionContext.from_config_options(config_options)
|
|
245
|
+
df = ir.evaluate(cache={}, timer=timer, context=context).to_polars()
|
|
222
246
|
if timer is None:
|
|
223
247
|
return df
|
|
224
248
|
else:
|
|
@@ -236,7 +260,7 @@ def _callback(
|
|
|
236
260
|
""")
|
|
237
261
|
raise NotImplementedError(msg)
|
|
238
262
|
|
|
239
|
-
return evaluate_streaming(ir, config_options)
|
|
263
|
+
return evaluate_streaming(ir, config_options)
|
|
240
264
|
assert_never(f"Unknown executor '{config_options.executor}'")
|
|
241
265
|
|
|
242
266
|
|
|
@@ -287,7 +311,7 @@ def execute_with_cudf(
|
|
|
287
311
|
if (
|
|
288
312
|
memory_resource is None
|
|
289
313
|
and translator.config_options.executor.name == "streaming"
|
|
290
|
-
and translator.config_options.executor.
|
|
314
|
+
and translator.config_options.executor.cluster == "distributed"
|
|
291
315
|
): # pragma: no cover; Requires distributed cluster
|
|
292
316
|
memory_resource = rmm.mr.get_current_device_resource()
|
|
293
317
|
if len(ir_translation_errors):
|
cudf_polars/containers/column.py
CHANGED
|
@@ -5,11 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
import functools
|
|
9
8
|
from typing import TYPE_CHECKING
|
|
10
9
|
|
|
11
|
-
import polars as pl
|
|
12
|
-
import polars.datatypes.convert
|
|
13
10
|
from polars.exceptions import InvalidOperationError
|
|
14
11
|
|
|
15
12
|
import pylibcudf as plc
|
|
@@ -22,12 +19,17 @@ from pylibcudf.strings.convert.convert_integers import (
|
|
|
22
19
|
from pylibcudf.traits import is_floating_point
|
|
23
20
|
|
|
24
21
|
from cudf_polars.containers import DataType
|
|
22
|
+
from cudf_polars.containers.datatype import _dtype_from_header, _dtype_to_header
|
|
25
23
|
from cudf_polars.utils import conversion
|
|
26
24
|
from cudf_polars.utils.dtypes import is_order_preserving_cast
|
|
27
25
|
|
|
28
26
|
if TYPE_CHECKING:
|
|
29
27
|
from typing_extensions import Self
|
|
30
28
|
|
|
29
|
+
from polars import Series as pl_Series
|
|
30
|
+
|
|
31
|
+
from rmm.pylibrmm.stream import Stream
|
|
32
|
+
|
|
31
33
|
from cudf_polars.typing import (
|
|
32
34
|
ColumnHeader,
|
|
33
35
|
ColumnOptions,
|
|
@@ -38,22 +40,6 @@ if TYPE_CHECKING:
|
|
|
38
40
|
__all__: list[str] = ["Column"]
|
|
39
41
|
|
|
40
42
|
|
|
41
|
-
def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
|
|
42
|
-
"""Convert a Polars dtype short repr to a Polars dtype."""
|
|
43
|
-
# limitations of dtype_short_repr_to_dtype described in
|
|
44
|
-
# py-polars/polars/datatypes/convert.py#L299
|
|
45
|
-
if dtype_str.startswith("list["):
|
|
46
|
-
stripped = dtype_str.removeprefix("list[").removesuffix("]")
|
|
47
|
-
return pl.List(_dtype_short_repr_to_dtype(stripped))
|
|
48
|
-
pl_type = polars.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
|
|
49
|
-
if pl_type is None:
|
|
50
|
-
raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")
|
|
51
|
-
if isinstance(pl_type, polars.datatypes.DataTypeClass):
|
|
52
|
-
return pl_type()
|
|
53
|
-
else:
|
|
54
|
-
return pl_type
|
|
55
|
-
|
|
56
|
-
|
|
57
43
|
class Column:
|
|
58
44
|
"""An immutable column with sortedness metadata."""
|
|
59
45
|
|
|
@@ -85,7 +71,10 @@ class Column:
|
|
|
85
71
|
|
|
86
72
|
@classmethod
|
|
87
73
|
def deserialize(
|
|
88
|
-
cls,
|
|
74
|
+
cls,
|
|
75
|
+
header: ColumnHeader,
|
|
76
|
+
frames: tuple[memoryview[bytes], plc.gpumemoryview],
|
|
77
|
+
stream: Stream,
|
|
89
78
|
) -> Self:
|
|
90
79
|
"""
|
|
91
80
|
Create a Column from a serialized representation returned by `.serialize()`.
|
|
@@ -96,6 +85,10 @@ class Column:
|
|
|
96
85
|
The (unpickled) metadata required to reconstruct the object.
|
|
97
86
|
frames
|
|
98
87
|
Two-tuple of frames (a memoryview and a gpumemoryview).
|
|
88
|
+
stream
|
|
89
|
+
CUDA stream used for device memory operations and kernel launches
|
|
90
|
+
on this column. The caller is responsible for ensuring that
|
|
91
|
+
the data in ``frames`` is valid on ``stream``.
|
|
99
92
|
|
|
100
93
|
Returns
|
|
101
94
|
-------
|
|
@@ -104,7 +97,7 @@ class Column:
|
|
|
104
97
|
"""
|
|
105
98
|
packed_metadata, packed_gpu_data = frames
|
|
106
99
|
(plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
|
|
107
|
-
packed_metadata, packed_gpu_data
|
|
100
|
+
packed_metadata, packed_gpu_data, stream
|
|
108
101
|
).columns()
|
|
109
102
|
return cls(plc_column, **cls.deserialize_ctor_kwargs(header["column_kwargs"]))
|
|
110
103
|
|
|
@@ -113,20 +106,18 @@ class Column:
|
|
|
113
106
|
column_kwargs: ColumnOptions,
|
|
114
107
|
) -> DeserializedColumnOptions:
|
|
115
108
|
"""Deserialize the constructor kwargs for a Column."""
|
|
116
|
-
dtype = DataType( # pragma: no cover
|
|
117
|
-
_dtype_short_repr_to_dtype(column_kwargs["dtype"])
|
|
118
|
-
)
|
|
119
109
|
return {
|
|
120
110
|
"is_sorted": column_kwargs["is_sorted"],
|
|
121
111
|
"order": column_kwargs["order"],
|
|
122
112
|
"null_order": column_kwargs["null_order"],
|
|
123
113
|
"name": column_kwargs["name"],
|
|
124
|
-
"dtype": dtype,
|
|
114
|
+
"dtype": DataType(_dtype_from_header(column_kwargs["dtype"])),
|
|
125
115
|
}
|
|
126
116
|
|
|
127
117
|
def serialize(
|
|
128
118
|
self,
|
|
129
|
-
|
|
119
|
+
stream: Stream,
|
|
120
|
+
) -> tuple[ColumnHeader, tuple[memoryview[bytes], plc.gpumemoryview]]:
|
|
130
121
|
"""
|
|
131
122
|
Serialize the Column into header and frames.
|
|
132
123
|
|
|
@@ -145,7 +136,7 @@ class Column:
|
|
|
145
136
|
frames
|
|
146
137
|
Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
|
|
147
138
|
"""
|
|
148
|
-
packed = plc.contiguous_split.pack(plc.Table([self.obj]))
|
|
139
|
+
packed = plc.contiguous_split.pack(plc.Table([self.obj]), stream=stream)
|
|
149
140
|
header: ColumnHeader = {
|
|
150
141
|
"column_kwargs": self.serialize_ctor_kwargs(),
|
|
151
142
|
"frame_count": 2,
|
|
@@ -159,14 +150,20 @@ class Column:
|
|
|
159
150
|
"order": self.order,
|
|
160
151
|
"null_order": self.null_order,
|
|
161
152
|
"name": self.name,
|
|
162
|
-
"dtype":
|
|
153
|
+
"dtype": _dtype_to_header(self.dtype.polars_type),
|
|
163
154
|
}
|
|
164
155
|
|
|
165
|
-
|
|
166
|
-
def obj_scalar(self) -> plc.Scalar:
|
|
156
|
+
def obj_scalar(self, stream: Stream) -> plc.Scalar:
|
|
167
157
|
"""
|
|
168
158
|
A copy of the column object as a pylibcudf Scalar.
|
|
169
159
|
|
|
160
|
+
Parameters
|
|
161
|
+
----------
|
|
162
|
+
stream
|
|
163
|
+
CUDA stream used for device memory operations and kernel launches.
|
|
164
|
+
``self.obj`` must be valid on this stream, and the result will be
|
|
165
|
+
valid on this stream.
|
|
166
|
+
|
|
170
167
|
Returns
|
|
171
168
|
-------
|
|
172
169
|
pylibcudf Scalar object.
|
|
@@ -178,7 +175,7 @@ class Column:
|
|
|
178
175
|
"""
|
|
179
176
|
if not self.is_scalar:
|
|
180
177
|
raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
|
|
181
|
-
return plc.copying.get_element(self.obj, 0)
|
|
178
|
+
return plc.copying.get_element(self.obj, 0, stream=stream)
|
|
182
179
|
|
|
183
180
|
def rename(self, name: str | None, /) -> Self:
|
|
184
181
|
"""
|
|
@@ -228,6 +225,7 @@ class Column:
|
|
|
228
225
|
*,
|
|
229
226
|
order: plc.types.Order,
|
|
230
227
|
null_order: plc.types.NullOrder,
|
|
228
|
+
stream: Stream,
|
|
231
229
|
) -> bool:
|
|
232
230
|
"""
|
|
233
231
|
Check if the column is sorted.
|
|
@@ -238,6 +236,9 @@ class Column:
|
|
|
238
236
|
The requested sort order.
|
|
239
237
|
null_order
|
|
240
238
|
Where nulls sort to.
|
|
239
|
+
stream
|
|
240
|
+
CUDA stream used for device memory operations and kernel launches
|
|
241
|
+
on this Column. The data in ``self.obj`` must be valid on this stream.
|
|
241
242
|
|
|
242
243
|
Returns
|
|
243
244
|
-------
|
|
@@ -254,14 +255,16 @@ class Column:
|
|
|
254
255
|
return self.order == order and (
|
|
255
256
|
self.null_count == 0 or self.null_order == null_order
|
|
256
257
|
)
|
|
257
|
-
if plc.sorting.is_sorted(
|
|
258
|
+
if plc.sorting.is_sorted(
|
|
259
|
+
plc.Table([self.obj]), [order], [null_order], stream=stream
|
|
260
|
+
):
|
|
258
261
|
self.sorted = plc.types.Sorted.YES
|
|
259
262
|
self.order = order
|
|
260
263
|
self.null_order = null_order
|
|
261
264
|
return True
|
|
262
265
|
return False
|
|
263
266
|
|
|
264
|
-
def astype(self, dtype: DataType) -> Column:
|
|
267
|
+
def astype(self, dtype: DataType, stream: Stream) -> Column:
|
|
265
268
|
"""
|
|
266
269
|
Cast the column to the requested dtype.
|
|
267
270
|
|
|
@@ -269,6 +272,9 @@ class Column:
|
|
|
269
272
|
----------
|
|
270
273
|
dtype
|
|
271
274
|
Datatype to cast to.
|
|
275
|
+
stream
|
|
276
|
+
CUDA stream used for device memory operations and kernel launches
|
|
277
|
+
on this Column. The data in ``self.obj`` must be valid on this stream.
|
|
272
278
|
|
|
273
279
|
Returns
|
|
274
280
|
-------
|
|
@@ -284,7 +290,7 @@ class Column:
|
|
|
284
290
|
This only produces a copy if the requested dtype doesn't match
|
|
285
291
|
the current one.
|
|
286
292
|
"""
|
|
287
|
-
plc_dtype = dtype.
|
|
293
|
+
plc_dtype = dtype.plc_type
|
|
288
294
|
if self.obj.type() == plc_dtype:
|
|
289
295
|
return self
|
|
290
296
|
|
|
@@ -292,12 +298,16 @@ class Column:
|
|
|
292
298
|
plc_dtype.id() == plc.TypeId.STRING
|
|
293
299
|
or self.obj.type().id() == plc.TypeId.STRING
|
|
294
300
|
):
|
|
295
|
-
return Column(
|
|
301
|
+
return Column(
|
|
302
|
+
self._handle_string_cast(plc_dtype, stream=stream), dtype=dtype
|
|
303
|
+
)
|
|
296
304
|
elif plc.traits.is_integral_not_bool(
|
|
297
305
|
self.obj.type()
|
|
298
306
|
) and plc.traits.is_timestamp(plc_dtype):
|
|
299
|
-
upcasted = plc.unary.cast(
|
|
300
|
-
|
|
307
|
+
upcasted = plc.unary.cast(
|
|
308
|
+
self.obj, plc.DataType(plc.TypeId.INT64), stream=stream
|
|
309
|
+
)
|
|
310
|
+
plc_col = plc.column.Column(
|
|
301
311
|
plc_dtype,
|
|
302
312
|
upcasted.size(),
|
|
303
313
|
upcasted.data(),
|
|
@@ -306,11 +316,11 @@ class Column:
|
|
|
306
316
|
upcasted.offset(),
|
|
307
317
|
upcasted.children(),
|
|
308
318
|
)
|
|
309
|
-
return Column(
|
|
319
|
+
return Column(plc_col, dtype=dtype).sorted_like(self)
|
|
310
320
|
elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp(
|
|
311
321
|
self.obj.type()
|
|
312
322
|
):
|
|
313
|
-
|
|
323
|
+
plc_col = plc.column.Column(
|
|
314
324
|
plc.DataType(plc.TypeId.INT64),
|
|
315
325
|
self.obj.size(),
|
|
316
326
|
self.obj.data(),
|
|
@@ -319,42 +329,46 @@ class Column:
|
|
|
319
329
|
self.obj.offset(),
|
|
320
330
|
self.obj.children(),
|
|
321
331
|
)
|
|
322
|
-
return Column(
|
|
323
|
-
|
|
324
|
-
)
|
|
332
|
+
return Column(
|
|
333
|
+
plc.unary.cast(plc_col, plc_dtype, stream=stream), dtype=dtype
|
|
334
|
+
).sorted_like(self)
|
|
325
335
|
else:
|
|
326
|
-
result = Column(
|
|
336
|
+
result = Column(
|
|
337
|
+
plc.unary.cast(self.obj, plc_dtype, stream=stream), dtype=dtype
|
|
338
|
+
)
|
|
327
339
|
if is_order_preserving_cast(self.obj.type(), plc_dtype):
|
|
328
340
|
return result.sorted_like(self)
|
|
329
341
|
return result
|
|
330
342
|
|
|
331
|
-
def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column:
|
|
343
|
+
def _handle_string_cast(self, dtype: plc.DataType, stream: Stream) -> plc.Column:
|
|
332
344
|
if dtype.id() == plc.TypeId.STRING:
|
|
333
345
|
if is_floating_point(self.obj.type()):
|
|
334
|
-
return from_floats(self.obj)
|
|
346
|
+
return from_floats(self.obj, stream=stream)
|
|
335
347
|
else:
|
|
336
|
-
return from_integers(self.obj)
|
|
348
|
+
return from_integers(self.obj, stream=stream)
|
|
337
349
|
else:
|
|
338
350
|
if is_floating_point(dtype):
|
|
339
|
-
floats = is_float(self.obj)
|
|
351
|
+
floats = is_float(self.obj, stream=stream)
|
|
340
352
|
if not plc.reduce.reduce(
|
|
341
353
|
floats,
|
|
342
354
|
plc.aggregation.all(),
|
|
343
355
|
plc.DataType(plc.TypeId.BOOL8),
|
|
356
|
+
stream=stream,
|
|
344
357
|
).to_py():
|
|
345
358
|
raise InvalidOperationError("Conversion from `str` failed.")
|
|
346
359
|
return to_floats(self.obj, dtype)
|
|
347
360
|
else:
|
|
348
|
-
integers = is_integer(self.obj)
|
|
361
|
+
integers = is_integer(self.obj, stream=stream)
|
|
349
362
|
if not plc.reduce.reduce(
|
|
350
363
|
integers,
|
|
351
364
|
plc.aggregation.all(),
|
|
352
365
|
plc.DataType(plc.TypeId.BOOL8),
|
|
366
|
+
stream=stream,
|
|
353
367
|
).to_py():
|
|
354
368
|
raise InvalidOperationError("Conversion from `str` failed.")
|
|
355
|
-
return to_integers(self.obj, dtype)
|
|
369
|
+
return to_integers(self.obj, dtype, stream=stream)
|
|
356
370
|
|
|
357
|
-
def copy_metadata(self, from_:
|
|
371
|
+
def copy_metadata(self, from_: pl_Series, /) -> Self:
|
|
358
372
|
"""
|
|
359
373
|
Copy metadata from a host series onto self.
|
|
360
374
|
|
|
@@ -439,27 +453,44 @@ class Column:
|
|
|
439
453
|
dtype=self.dtype,
|
|
440
454
|
)
|
|
441
455
|
|
|
442
|
-
def mask_nans(self) -> Self:
|
|
456
|
+
def mask_nans(self, stream: Stream) -> Self:
|
|
443
457
|
"""Return a shallow copy of self with nans masked out."""
|
|
444
458
|
if plc.traits.is_floating_point(self.obj.type()):
|
|
445
459
|
old_count = self.null_count
|
|
446
|
-
mask, new_count = plc.transform.nans_to_nulls(self.obj)
|
|
460
|
+
mask, new_count = plc.transform.nans_to_nulls(self.obj, stream=stream)
|
|
447
461
|
result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
|
|
448
462
|
if old_count == new_count:
|
|
449
463
|
return result.sorted_like(self)
|
|
450
464
|
return result
|
|
451
465
|
return self.copy()
|
|
452
466
|
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
467
|
+
def nan_count(self, stream: Stream) -> int:
|
|
468
|
+
"""
|
|
469
|
+
Return the number of NaN values in the column.
|
|
470
|
+
|
|
471
|
+
Parameters
|
|
472
|
+
----------
|
|
473
|
+
stream
|
|
474
|
+
CUDA stream used for device memory operations and kernel launches.
|
|
475
|
+
``self.obj`` must be valid on this stream, and the result will be
|
|
476
|
+
valid on this stream.
|
|
477
|
+
|
|
478
|
+
Returns
|
|
479
|
+
-------
|
|
480
|
+
Number of NaN values in the column.
|
|
481
|
+
"""
|
|
482
|
+
result: int
|
|
456
483
|
if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
|
|
457
|
-
|
|
458
|
-
|
|
484
|
+
# See https://github.com/rapidsai/cudf/issues/20202 for why we type ignore
|
|
485
|
+
result = plc.reduce.reduce( # type: ignore[assignment]
|
|
486
|
+
plc.unary.is_nan(self.obj, stream=stream),
|
|
459
487
|
plc.aggregation.sum(),
|
|
460
488
|
plc.types.SIZE_TYPE,
|
|
489
|
+
stream=stream,
|
|
461
490
|
).to_py()
|
|
462
|
-
|
|
491
|
+
else:
|
|
492
|
+
result = 0
|
|
493
|
+
return result
|
|
463
494
|
|
|
464
495
|
@property
|
|
465
496
|
def size(self) -> int:
|
|
@@ -471,7 +502,7 @@ class Column:
|
|
|
471
502
|
"""Return the number of Null values in the column."""
|
|
472
503
|
return self.obj.null_count()
|
|
473
504
|
|
|
474
|
-
def slice(self, zlice: Slice | None) -> Self:
|
|
505
|
+
def slice(self, zlice: Slice | None, stream: Stream) -> Self:
|
|
475
506
|
"""
|
|
476
507
|
Slice a column.
|
|
477
508
|
|
|
@@ -480,6 +511,9 @@ class Column:
|
|
|
480
511
|
zlice
|
|
481
512
|
optional, tuple of start and length, negative values of start
|
|
482
513
|
treated as for python indexing. If not provided, returns self.
|
|
514
|
+
stream
|
|
515
|
+
CUDA stream used for device memory operations and kernel launches
|
|
516
|
+
on this Column. The data in ``self.obj`` must be valid on this stream.
|
|
483
517
|
|
|
484
518
|
Returns
|
|
485
519
|
-------
|
|
@@ -490,6 +524,7 @@ class Column:
|
|
|
490
524
|
(table,) = plc.copying.slice(
|
|
491
525
|
plc.Table([self.obj]),
|
|
492
526
|
conversion.from_polars_slice(zlice, num_rows=self.size),
|
|
527
|
+
stream=stream,
|
|
493
528
|
)
|
|
494
529
|
(column,) = table.columns()
|
|
495
530
|
return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)
|