cudf-polars-cu12 25.6.0__py3-none-any.whl → 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +22 -22
- cudf_polars/containers/__init__.py +4 -2
- cudf_polars/containers/column.py +116 -42
- cudf_polars/containers/dataframe.py +71 -22
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/expr.py +2 -0
- cudf_polars/dsl/expressions/aggregation.py +31 -15
- cudf_polars/dsl/expressions/base.py +9 -5
- cudf_polars/dsl/expressions/binaryop.py +26 -5
- cudf_polars/dsl/expressions/boolean.py +91 -49
- cudf_polars/dsl/expressions/datetime.py +30 -35
- cudf_polars/dsl/expressions/literal.py +23 -11
- cudf_polars/dsl/expressions/rolling.py +521 -19
- cudf_polars/dsl/expressions/selection.py +7 -7
- cudf_polars/dsl/expressions/slicing.py +4 -5
- cudf_polars/dsl/expressions/sorting.py +5 -4
- cudf_polars/dsl/expressions/string.py +715 -72
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +6 -3
- cudf_polars/dsl/expressions/unary.py +265 -47
- cudf_polars/dsl/ir.py +521 -279
- cudf_polars/dsl/nodebase.py +10 -3
- cudf_polars/dsl/to_ast.py +60 -21
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +119 -80
- cudf_polars/dsl/traversal.py +64 -15
- cudf_polars/dsl/utils/aggregations.py +212 -23
- cudf_polars/dsl/utils/groupby.py +8 -7
- cudf_polars/dsl/utils/replace.py +19 -4
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +12 -4
- cudf_polars/dsl/utils/windows.py +8 -2
- cudf_polars/experimental/base.py +347 -2
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +13 -478
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +13 -9
- cudf_polars/experimental/dispatch.py +80 -8
- cudf_polars/experimental/distinct.py +42 -19
- cudf_polars/experimental/explain.py +33 -3
- cudf_polars/experimental/expressions.py +94 -25
- cudf_polars/experimental/groupby.py +82 -43
- cudf_polars/experimental/io.py +656 -67
- cudf_polars/experimental/join.py +56 -18
- cudf_polars/experimental/parallel.py +112 -15
- cudf_polars/experimental/select.py +72 -5
- cudf_polars/experimental/shuffle.py +111 -51
- cudf_polars/experimental/sort.py +575 -11
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +72 -3
- cudf_polars/testing/asserts.py +105 -35
- cudf_polars/testing/io.py +52 -2
- cudf_polars/testing/plugin.py +68 -80
- cudf_polars/typing/__init__.py +41 -22
- cudf_polars/utils/config.py +479 -91
- cudf_polars/utils/dtypes.py +20 -127
- cudf_polars/utils/versions.py +9 -5
- {cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/METADATA +22 -18
- cudf_polars_cu12-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu12-25.6.0.dist-info/RECORD +0 -73
- {cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/WHEEL +0 -0
- {cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.10.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
f4e35ca02118eada383e7417273c6cb1857ec66e
|
cudf_polars/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
25.
|
|
1
|
+
25.10.00
|
cudf_polars/callback.py
CHANGED
|
@@ -7,6 +7,7 @@ from __future__ import annotations
|
|
|
7
7
|
|
|
8
8
|
import contextlib
|
|
9
9
|
import os
|
|
10
|
+
import textwrap
|
|
10
11
|
import time
|
|
11
12
|
import warnings
|
|
12
13
|
from functools import cache, partial
|
|
@@ -21,7 +22,9 @@ import pylibcudf
|
|
|
21
22
|
import rmm
|
|
22
23
|
from rmm._cuda import gpu
|
|
23
24
|
|
|
25
|
+
from cudf_polars.dsl.tracing import CUDF_POLARS_NVTX_DOMAIN
|
|
24
26
|
from cudf_polars.dsl.translate import Translator
|
|
27
|
+
from cudf_polars.utils.config import _env_get_int, get_total_device_memory
|
|
25
28
|
from cudf_polars.utils.timer import Timer
|
|
26
29
|
|
|
27
30
|
if TYPE_CHECKING:
|
|
@@ -37,21 +40,6 @@ if TYPE_CHECKING:
|
|
|
37
40
|
__all__: list[str] = ["execute_with_cudf"]
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
_SUPPORTED_PREFETCHES = {
|
|
41
|
-
"column_view::get_data",
|
|
42
|
-
"mutable_column_view::get_data",
|
|
43
|
-
"gather",
|
|
44
|
-
"hash_join",
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def _env_get_int(name: str, default: int) -> int:
|
|
49
|
-
try:
|
|
50
|
-
return int(os.getenv(name, default))
|
|
51
|
-
except (ValueError, TypeError): # pragma: no cover
|
|
52
|
-
return default # pragma: no cover
|
|
53
|
-
|
|
54
|
-
|
|
55
43
|
@cache
|
|
56
44
|
def default_memory_resource(
|
|
57
45
|
device: int,
|
|
@@ -84,8 +72,7 @@ def default_memory_resource(
|
|
|
84
72
|
# Leaving a 20% headroom to avoid OOM errors.
|
|
85
73
|
free_memory, _ = rmm.mr.available_device_memory()
|
|
86
74
|
free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
|
|
87
|
-
|
|
88
|
-
pylibcudf.experimental.enable_prefetching(key)
|
|
75
|
+
pylibcudf.prefetch.enable()
|
|
89
76
|
mr = rmm.mr.PrefetchResourceAdaptor(
|
|
90
77
|
rmm.mr.PoolMemoryResource(
|
|
91
78
|
rmm.mr.ManagedMemoryResource(),
|
|
@@ -102,8 +89,7 @@ def default_memory_resource(
|
|
|
102
89
|
):
|
|
103
90
|
raise ComputeError(
|
|
104
91
|
"GPU engine requested, but incorrect cudf-polars package installed. "
|
|
105
|
-
"
|
|
106
|
-
"and install `cudf-polars-cu11`"
|
|
92
|
+
"cudf-polars requires CUDA 12.0+ to installed."
|
|
107
93
|
) from None
|
|
108
94
|
else:
|
|
109
95
|
raise
|
|
@@ -140,7 +126,11 @@ def set_memory_resource(
|
|
|
140
126
|
mr = default_memory_resource(
|
|
141
127
|
device=device,
|
|
142
128
|
cuda_managed_memory=bool(
|
|
143
|
-
_env_get_int(
|
|
129
|
+
_env_get_int(
|
|
130
|
+
"POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY",
|
|
131
|
+
default=1 if get_total_device_memory() is not None else 0,
|
|
132
|
+
)
|
|
133
|
+
!= 0
|
|
144
134
|
),
|
|
145
135
|
)
|
|
146
136
|
rmm.mr.set_current_device_resource(mr)
|
|
@@ -222,7 +212,7 @@ def _callback(
|
|
|
222
212
|
if timer is not None:
|
|
223
213
|
assert should_time
|
|
224
214
|
with (
|
|
225
|
-
nvtx.annotate(message="ExecuteIR", domain=
|
|
215
|
+
nvtx.annotate(message="ExecuteIR", domain=CUDF_POLARS_NVTX_DOMAIN),
|
|
226
216
|
# Device must be set before memory resource is obtained.
|
|
227
217
|
set_device(config_options.device),
|
|
228
218
|
set_memory_resource(memory_resource),
|
|
@@ -236,6 +226,16 @@ def _callback(
|
|
|
236
226
|
elif config_options.executor.name == "streaming":
|
|
237
227
|
from cudf_polars.experimental.parallel import evaluate_streaming
|
|
238
228
|
|
|
229
|
+
if timer is not None:
|
|
230
|
+
msg = textwrap.dedent("""\
|
|
231
|
+
LazyFrame.profile() is not supported with the streaming executor.
|
|
232
|
+
To profile execution with the streaming executor, use:
|
|
233
|
+
|
|
234
|
+
- NVIDIA NSight Systems with the 'streaming' scheduler.
|
|
235
|
+
- Dask's built-in profiling tools with the 'distributed' scheduler.
|
|
236
|
+
""")
|
|
237
|
+
raise NotImplementedError(msg)
|
|
238
|
+
|
|
239
239
|
return evaluate_streaming(ir, config_options).to_polars()
|
|
240
240
|
assert_never(f"Unknown executor '{config_options.executor}'")
|
|
241
241
|
|
|
@@ -277,7 +277,7 @@ def execute_with_cudf(
|
|
|
277
277
|
|
|
278
278
|
memory_resource = config.memory_resource
|
|
279
279
|
|
|
280
|
-
with nvtx.annotate(message="ConvertIR", domain=
|
|
280
|
+
with nvtx.annotate(message="ConvertIR", domain=CUDF_POLARS_NVTX_DOMAIN):
|
|
281
281
|
translator = Translator(nt, config)
|
|
282
282
|
ir = translator.translate_ir()
|
|
283
283
|
ir_translation_errors = translator.errors
|
|
@@ -1,11 +1,13 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
"""Containers of concrete data."""
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
-
__all__: list[str] = ["Column", "DataFrame"]
|
|
8
|
+
__all__: list[str] = ["Column", "DataFrame", "DataType"]
|
|
9
9
|
|
|
10
|
+
# dataframe.py & column.py imports DataType, so import in this order to avoid circular import
|
|
11
|
+
from cudf_polars.containers.datatype import DataType # noqa: I001
|
|
10
12
|
from cudf_polars.containers.column import Column
|
|
11
13
|
from cudf_polars.containers.dataframe import DataFrame
|
cudf_polars/containers/column.py
CHANGED
|
@@ -8,6 +8,8 @@ from __future__ import annotations
|
|
|
8
8
|
import functools
|
|
9
9
|
from typing import TYPE_CHECKING
|
|
10
10
|
|
|
11
|
+
import polars as pl
|
|
12
|
+
import polars.datatypes.convert
|
|
11
13
|
from polars.exceptions import InvalidOperationError
|
|
12
14
|
|
|
13
15
|
import pylibcudf as plc
|
|
@@ -19,19 +21,39 @@ from pylibcudf.strings.convert.convert_integers import (
|
|
|
19
21
|
)
|
|
20
22
|
from pylibcudf.traits import is_floating_point
|
|
21
23
|
|
|
24
|
+
from cudf_polars.containers import DataType
|
|
22
25
|
from cudf_polars.utils import conversion
|
|
23
26
|
from cudf_polars.utils.dtypes import is_order_preserving_cast
|
|
24
27
|
|
|
25
28
|
if TYPE_CHECKING:
|
|
26
29
|
from typing_extensions import Self
|
|
27
30
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
+
from cudf_polars.typing import (
|
|
32
|
+
ColumnHeader,
|
|
33
|
+
ColumnOptions,
|
|
34
|
+
DeserializedColumnOptions,
|
|
35
|
+
Slice,
|
|
36
|
+
)
|
|
31
37
|
|
|
32
38
|
__all__: list[str] = ["Column"]
|
|
33
39
|
|
|
34
40
|
|
|
41
|
+
def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
|
|
42
|
+
"""Convert a Polars dtype short repr to a Polars dtype."""
|
|
43
|
+
# limitations of dtype_short_repr_to_dtype described in
|
|
44
|
+
# py-polars/polars/datatypes/convert.py#L299
|
|
45
|
+
if dtype_str.startswith("list["):
|
|
46
|
+
stripped = dtype_str.removeprefix("list[").removesuffix("]")
|
|
47
|
+
return pl.List(_dtype_short_repr_to_dtype(stripped))
|
|
48
|
+
pl_type = polars.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
|
|
49
|
+
if pl_type is None:
|
|
50
|
+
raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")
|
|
51
|
+
if isinstance(pl_type, polars.datatypes.DataTypeClass):
|
|
52
|
+
return pl_type()
|
|
53
|
+
else:
|
|
54
|
+
return pl_type
|
|
55
|
+
|
|
56
|
+
|
|
35
57
|
class Column:
|
|
36
58
|
"""An immutable column with sortedness metadata."""
|
|
37
59
|
|
|
@@ -43,10 +65,12 @@ class Column:
|
|
|
43
65
|
# Optional name, only ever set by evaluation of NamedExpr nodes
|
|
44
66
|
# The internal evaluation should not care about the name.
|
|
45
67
|
name: str | None
|
|
68
|
+
dtype: DataType
|
|
46
69
|
|
|
47
70
|
def __init__(
|
|
48
71
|
self,
|
|
49
72
|
column: plc.Column,
|
|
73
|
+
dtype: DataType,
|
|
50
74
|
*,
|
|
51
75
|
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
|
|
52
76
|
order: plc.types.Order = plc.types.Order.ASCENDING,
|
|
@@ -56,6 +80,7 @@ class Column:
|
|
|
56
80
|
self.obj = column
|
|
57
81
|
self.is_scalar = self.size == 1
|
|
58
82
|
self.name = name
|
|
83
|
+
self.dtype = dtype
|
|
59
84
|
self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
|
|
60
85
|
|
|
61
86
|
@classmethod
|
|
@@ -81,7 +106,23 @@ class Column:
|
|
|
81
106
|
(plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
|
|
82
107
|
packed_metadata, packed_gpu_data
|
|
83
108
|
).columns()
|
|
84
|
-
return cls(plc_column, **header["column_kwargs"])
|
|
109
|
+
return cls(plc_column, **cls.deserialize_ctor_kwargs(header["column_kwargs"]))
|
|
110
|
+
|
|
111
|
+
@staticmethod
|
|
112
|
+
def deserialize_ctor_kwargs(
|
|
113
|
+
column_kwargs: ColumnOptions,
|
|
114
|
+
) -> DeserializedColumnOptions:
|
|
115
|
+
"""Deserialize the constructor kwargs for a Column."""
|
|
116
|
+
dtype = DataType( # pragma: no cover
|
|
117
|
+
_dtype_short_repr_to_dtype(column_kwargs["dtype"])
|
|
118
|
+
)
|
|
119
|
+
return {
|
|
120
|
+
"is_sorted": column_kwargs["is_sorted"],
|
|
121
|
+
"order": column_kwargs["order"],
|
|
122
|
+
"null_order": column_kwargs["null_order"],
|
|
123
|
+
"name": column_kwargs["name"],
|
|
124
|
+
"dtype": dtype,
|
|
125
|
+
}
|
|
85
126
|
|
|
86
127
|
def serialize(
|
|
87
128
|
self,
|
|
@@ -105,17 +146,21 @@ class Column:
|
|
|
105
146
|
Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
|
|
106
147
|
"""
|
|
107
148
|
packed = plc.contiguous_split.pack(plc.Table([self.obj]))
|
|
108
|
-
|
|
149
|
+
header: ColumnHeader = {
|
|
150
|
+
"column_kwargs": self.serialize_ctor_kwargs(),
|
|
151
|
+
"frame_count": 2,
|
|
152
|
+
}
|
|
153
|
+
return header, packed.release()
|
|
154
|
+
|
|
155
|
+
def serialize_ctor_kwargs(self) -> ColumnOptions:
|
|
156
|
+
"""Serialize the constructor kwargs for self."""
|
|
157
|
+
return {
|
|
109
158
|
"is_sorted": self.is_sorted,
|
|
110
159
|
"order": self.order,
|
|
111
160
|
"null_order": self.null_order,
|
|
112
161
|
"name": self.name,
|
|
162
|
+
"dtype": pl.polars.dtype_str_repr(self.dtype.polars),
|
|
113
163
|
}
|
|
114
|
-
header: ColumnHeader = {
|
|
115
|
-
"column_kwargs": column_kwargs,
|
|
116
|
-
"frame_count": 2,
|
|
117
|
-
}
|
|
118
|
-
return header, packed.release()
|
|
119
164
|
|
|
120
165
|
@functools.cached_property
|
|
121
166
|
def obj_scalar(self) -> plc.Scalar:
|
|
@@ -172,6 +217,7 @@ class Column:
|
|
|
172
217
|
return type(self)(
|
|
173
218
|
self.obj,
|
|
174
219
|
name=self.name,
|
|
220
|
+
dtype=self.dtype,
|
|
175
221
|
is_sorted=like.is_sorted,
|
|
176
222
|
order=like.order,
|
|
177
223
|
null_order=like.null_order,
|
|
@@ -202,11 +248,11 @@ class Column:
|
|
|
202
248
|
If the sortedness flag is not set, this launches a kernel to
|
|
203
249
|
check sortedness.
|
|
204
250
|
"""
|
|
205
|
-
if self.
|
|
251
|
+
if self.size <= 1 or self.size == self.null_count:
|
|
206
252
|
return True
|
|
207
253
|
if self.is_sorted == plc.types.Sorted.YES:
|
|
208
254
|
return self.order == order and (
|
|
209
|
-
self.
|
|
255
|
+
self.null_count == 0 or self.null_order == null_order
|
|
210
256
|
)
|
|
211
257
|
if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
|
|
212
258
|
self.sorted = plc.types.Sorted.YES
|
|
@@ -215,7 +261,7 @@ class Column:
|
|
|
215
261
|
return True
|
|
216
262
|
return False
|
|
217
263
|
|
|
218
|
-
def astype(self, dtype:
|
|
264
|
+
def astype(self, dtype: DataType) -> Column:
|
|
219
265
|
"""
|
|
220
266
|
Cast the column to as the requested dtype.
|
|
221
267
|
|
|
@@ -238,14 +284,47 @@ class Column:
|
|
|
238
284
|
This only produces a copy if the requested dtype doesn't match
|
|
239
285
|
the current one.
|
|
240
286
|
"""
|
|
241
|
-
|
|
287
|
+
plc_dtype = dtype.plc
|
|
288
|
+
if self.obj.type() == plc_dtype:
|
|
242
289
|
return self
|
|
243
290
|
|
|
244
|
-
if
|
|
245
|
-
|
|
291
|
+
if (
|
|
292
|
+
plc_dtype.id() == plc.TypeId.STRING
|
|
293
|
+
or self.obj.type().id() == plc.TypeId.STRING
|
|
294
|
+
):
|
|
295
|
+
return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
|
|
296
|
+
elif plc.traits.is_integral_not_bool(
|
|
297
|
+
self.obj.type()
|
|
298
|
+
) and plc.traits.is_timestamp(plc_dtype):
|
|
299
|
+
upcasted = plc.unary.cast(self.obj, plc.DataType(plc.TypeId.INT64))
|
|
300
|
+
result = plc.column.Column(
|
|
301
|
+
plc_dtype,
|
|
302
|
+
upcasted.size(),
|
|
303
|
+
upcasted.data(),
|
|
304
|
+
upcasted.null_mask(),
|
|
305
|
+
upcasted.null_count(),
|
|
306
|
+
upcasted.offset(),
|
|
307
|
+
upcasted.children(),
|
|
308
|
+
)
|
|
309
|
+
return Column(result, dtype=dtype).sorted_like(self)
|
|
310
|
+
elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp(
|
|
311
|
+
self.obj.type()
|
|
312
|
+
):
|
|
313
|
+
result = plc.column.Column(
|
|
314
|
+
plc.DataType(plc.TypeId.INT64),
|
|
315
|
+
self.obj.size(),
|
|
316
|
+
self.obj.data(),
|
|
317
|
+
self.obj.null_mask(),
|
|
318
|
+
self.obj.null_count(),
|
|
319
|
+
self.obj.offset(),
|
|
320
|
+
self.obj.children(),
|
|
321
|
+
)
|
|
322
|
+
return Column(plc.unary.cast(result, plc_dtype), dtype=dtype).sorted_like(
|
|
323
|
+
self
|
|
324
|
+
)
|
|
246
325
|
else:
|
|
247
|
-
result = Column(plc.unary.cast(self.obj, dtype)
|
|
248
|
-
if is_order_preserving_cast(self.obj.type(),
|
|
326
|
+
result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
|
|
327
|
+
if is_order_preserving_cast(self.obj.type(), plc_dtype):
|
|
249
328
|
return result.sorted_like(self)
|
|
250
329
|
return result
|
|
251
330
|
|
|
@@ -258,24 +337,20 @@ class Column:
|
|
|
258
337
|
else:
|
|
259
338
|
if is_floating_point(dtype):
|
|
260
339
|
floats = is_float(self.obj)
|
|
261
|
-
if not plc.
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
)
|
|
267
|
-
).as_py():
|
|
340
|
+
if not plc.reduce.reduce(
|
|
341
|
+
floats,
|
|
342
|
+
plc.aggregation.all(),
|
|
343
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
344
|
+
).to_py():
|
|
268
345
|
raise InvalidOperationError("Conversion from `str` failed.")
|
|
269
346
|
return to_floats(self.obj, dtype)
|
|
270
347
|
else:
|
|
271
348
|
integers = is_integer(self.obj)
|
|
272
|
-
if not plc.
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
)
|
|
278
|
-
).as_py():
|
|
349
|
+
if not plc.reduce.reduce(
|
|
350
|
+
integers,
|
|
351
|
+
plc.aggregation.all(),
|
|
352
|
+
plc.DataType(plc.TypeId.BOOL8),
|
|
353
|
+
).to_py():
|
|
279
354
|
raise InvalidOperationError("Conversion from `str` failed.")
|
|
280
355
|
return to_integers(self.obj, dtype)
|
|
281
356
|
|
|
@@ -361,6 +436,7 @@ class Column:
|
|
|
361
436
|
order=self.order,
|
|
362
437
|
null_order=self.null_order,
|
|
363
438
|
name=self.name,
|
|
439
|
+
dtype=self.dtype,
|
|
364
440
|
)
|
|
365
441
|
|
|
366
442
|
def mask_nans(self) -> Self:
|
|
@@ -368,7 +444,7 @@ class Column:
|
|
|
368
444
|
if plc.traits.is_floating_point(self.obj.type()):
|
|
369
445
|
old_count = self.null_count
|
|
370
446
|
mask, new_count = plc.transform.nans_to_nulls(self.obj)
|
|
371
|
-
result = type(self)(self.obj.with_mask(mask, new_count))
|
|
447
|
+
result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
|
|
372
448
|
if old_count == new_count:
|
|
373
449
|
return result.sorted_like(self)
|
|
374
450
|
return result
|
|
@@ -377,14 +453,12 @@ class Column:
|
|
|
377
453
|
@functools.cached_property
|
|
378
454
|
def nan_count(self) -> int:
|
|
379
455
|
"""Return the number of NaN values in the column."""
|
|
380
|
-
if plc.traits.is_floating_point(self.obj.type()):
|
|
381
|
-
return plc.
|
|
382
|
-
plc.
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
)
|
|
387
|
-
).as_py()
|
|
456
|
+
if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
|
|
457
|
+
return plc.reduce.reduce(
|
|
458
|
+
plc.unary.is_nan(self.obj),
|
|
459
|
+
plc.aggregation.sum(),
|
|
460
|
+
plc.types.SIZE_TYPE,
|
|
461
|
+
).to_py()
|
|
388
462
|
return 0
|
|
389
463
|
|
|
390
464
|
@property
|
|
@@ -418,4 +492,4 @@ class Column:
|
|
|
418
492
|
conversion.from_polars_slice(zlice, num_rows=self.size),
|
|
419
493
|
)
|
|
420
494
|
(column,) = table.columns()
|
|
421
|
-
return type(self)(column, name=self.name).sorted_like(self)
|
|
495
|
+
return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)
|
|
@@ -12,20 +12,60 @@ import polars as pl
|
|
|
12
12
|
|
|
13
13
|
import pylibcudf as plc
|
|
14
14
|
|
|
15
|
-
from cudf_polars.containers import Column
|
|
15
|
+
from cudf_polars.containers import Column, DataType
|
|
16
16
|
from cudf_polars.utils import conversion
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
from collections.abc import Iterable, Mapping, Sequence, Set
|
|
20
20
|
|
|
21
|
-
from typing_extensions import Any, Self
|
|
21
|
+
from typing_extensions import Any, CapsuleType, Self
|
|
22
22
|
|
|
23
|
-
from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
|
|
23
|
+
from cudf_polars.typing import ColumnOptions, DataFrameHeader, PolarsDataType, Slice
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
__all__: list[str] = ["DataFrame"]
|
|
27
27
|
|
|
28
28
|
|
|
29
|
+
def _create_polars_column_metadata(
|
|
30
|
+
name: str, dtype: PolarsDataType
|
|
31
|
+
) -> plc.interop.ColumnMetadata:
|
|
32
|
+
"""Create ColumnMetadata preserving dtype attributes not supported by libcudf."""
|
|
33
|
+
children_meta = []
|
|
34
|
+
timezone = ""
|
|
35
|
+
precision: int | None = None
|
|
36
|
+
|
|
37
|
+
if isinstance(dtype, pl.Struct):
|
|
38
|
+
children_meta = [
|
|
39
|
+
_create_polars_column_metadata(field.name, field.dtype)
|
|
40
|
+
for field in dtype.fields
|
|
41
|
+
]
|
|
42
|
+
elif isinstance(dtype, pl.Datetime):
|
|
43
|
+
timezone = dtype.time_zone or timezone
|
|
44
|
+
elif isinstance(dtype, pl.Decimal):
|
|
45
|
+
precision = dtype.precision
|
|
46
|
+
|
|
47
|
+
return plc.interop.ColumnMetadata(
|
|
48
|
+
name=name,
|
|
49
|
+
timezone=timezone,
|
|
50
|
+
precision=precision,
|
|
51
|
+
children_meta=children_meta,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# This is also defined in pylibcudf.interop
|
|
56
|
+
class _ObjectWithArrowMetadata:
|
|
57
|
+
def __init__(
|
|
58
|
+
self, obj: plc.Table | plc.Column, metadata: list[plc.interop.ColumnMetadata]
|
|
59
|
+
) -> None:
|
|
60
|
+
self.obj = obj
|
|
61
|
+
self.metadata = metadata
|
|
62
|
+
|
|
63
|
+
def __arrow_c_array__(
|
|
64
|
+
self, requested_schema: None = None
|
|
65
|
+
) -> tuple[CapsuleType, CapsuleType]:
|
|
66
|
+
return self.obj._to_schema(self.metadata), self.obj._to_host_array()
|
|
67
|
+
|
|
68
|
+
|
|
29
69
|
# Pacify the type checker. DataFrame init asserts that all the columns
|
|
30
70
|
# have a string name, so let's narrow the type.
|
|
31
71
|
class NamedColumn(Column):
|
|
@@ -44,6 +84,7 @@ class DataFrame:
|
|
|
44
84
|
if any(c.name is None for c in columns):
|
|
45
85
|
raise ValueError("All columns must have a name")
|
|
46
86
|
self.columns = [cast(NamedColumn, c) for c in columns]
|
|
87
|
+
self.dtypes = [c.dtype for c in self.columns]
|
|
47
88
|
self.column_map = {c.name: c for c in self.columns}
|
|
48
89
|
self.table = plc.Table([c.obj for c in self.columns])
|
|
49
90
|
|
|
@@ -60,11 +101,12 @@ class DataFrame:
|
|
|
60
101
|
# To guarantee we produce correct names, we therefore
|
|
61
102
|
# serialise with names we control and rename with that map.
|
|
62
103
|
name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
104
|
+
metadata = [
|
|
105
|
+
_create_polars_column_metadata(name, dtype.polars)
|
|
106
|
+
for name, dtype in zip(name_map, self.dtypes, strict=True)
|
|
107
|
+
]
|
|
108
|
+
table_with_metadata = _ObjectWithArrowMetadata(self.table, metadata)
|
|
109
|
+
df = pl.DataFrame(table_with_metadata)
|
|
68
110
|
return df.rename(name_map).with_columns(
|
|
69
111
|
pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING)
|
|
70
112
|
if c.is_sorted
|
|
@@ -106,16 +148,18 @@ class DataFrame:
|
|
|
106
148
|
-------
|
|
107
149
|
New dataframe representing the input.
|
|
108
150
|
"""
|
|
109
|
-
plc_table = plc.Table(df)
|
|
151
|
+
plc_table = plc.Table.from_arrow(df)
|
|
110
152
|
return cls(
|
|
111
|
-
Column(d_col, name=name).copy_metadata(h_col)
|
|
153
|
+
Column(d_col, name=name, dtype=DataType(h_col.dtype)).copy_metadata(h_col)
|
|
112
154
|
for d_col, h_col, name in zip(
|
|
113
155
|
plc_table.columns(), df.iter_columns(), df.columns, strict=True
|
|
114
156
|
)
|
|
115
157
|
)
|
|
116
158
|
|
|
117
159
|
@classmethod
|
|
118
|
-
def from_table(
|
|
160
|
+
def from_table(
|
|
161
|
+
cls, table: plc.Table, names: Sequence[str], dtypes: Sequence[DataType]
|
|
162
|
+
) -> Self:
|
|
119
163
|
"""
|
|
120
164
|
Create from a pylibcudf table.
|
|
121
165
|
|
|
@@ -125,6 +169,8 @@ class DataFrame:
|
|
|
125
169
|
Pylibcudf table to obtain columns from
|
|
126
170
|
names
|
|
127
171
|
Names for the columns
|
|
172
|
+
dtypes
|
|
173
|
+
Dtypes for the columns
|
|
128
174
|
|
|
129
175
|
Returns
|
|
130
176
|
-------
|
|
@@ -139,7 +185,8 @@ class DataFrame:
|
|
|
139
185
|
if table.num_columns() != len(names):
|
|
140
186
|
raise ValueError("Mismatching name and table length.")
|
|
141
187
|
return cls(
|
|
142
|
-
Column(c, name=name
|
|
188
|
+
Column(c, name=name, dtype=dtype)
|
|
189
|
+
for c, name, dtype in zip(table.columns(), names, dtypes, strict=True)
|
|
143
190
|
)
|
|
144
191
|
|
|
145
192
|
@classmethod
|
|
@@ -166,7 +213,7 @@ class DataFrame:
|
|
|
166
213
|
packed_metadata, packed_gpu_data
|
|
167
214
|
)
|
|
168
215
|
return cls(
|
|
169
|
-
Column(c, **kw)
|
|
216
|
+
Column(c, **Column.deserialize_ctor_kwargs(kw))
|
|
170
217
|
for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
|
|
171
218
|
)
|
|
172
219
|
|
|
@@ -195,13 +242,7 @@ class DataFrame:
|
|
|
195
242
|
|
|
196
243
|
# Keyword arguments for `Column.__init__`.
|
|
197
244
|
columns_kwargs: list[ColumnOptions] = [
|
|
198
|
-
|
|
199
|
-
"is_sorted": col.is_sorted,
|
|
200
|
-
"order": col.order,
|
|
201
|
-
"null_order": col.null_order,
|
|
202
|
-
"name": col.name,
|
|
203
|
-
}
|
|
204
|
-
for col in self.columns
|
|
245
|
+
col.serialize_ctor_kwargs() for col in self.columns
|
|
205
246
|
]
|
|
206
247
|
header: DataFrameHeader = {
|
|
207
248
|
"columns_kwargs": columns_kwargs,
|
|
@@ -288,7 +329,11 @@ class DataFrame:
|
|
|
288
329
|
def filter(self, mask: Column) -> Self:
|
|
289
330
|
"""Return a filtered table given a mask."""
|
|
290
331
|
table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
|
|
291
|
-
return
|
|
332
|
+
return (
|
|
333
|
+
type(self)
|
|
334
|
+
.from_table(table, self.column_names, self.dtypes)
|
|
335
|
+
.sorted_like(self)
|
|
336
|
+
)
|
|
292
337
|
|
|
293
338
|
def slice(self, zlice: Slice | None) -> Self:
|
|
294
339
|
"""
|
|
@@ -309,4 +354,8 @@ class DataFrame:
|
|
|
309
354
|
(table,) = plc.copying.slice(
|
|
310
355
|
self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
|
|
311
356
|
)
|
|
312
|
-
return
|
|
357
|
+
return (
|
|
358
|
+
type(self)
|
|
359
|
+
.from_table(table, self.column_names, self.dtypes)
|
|
360
|
+
.sorted_like(self)
|
|
361
|
+
)
|