cudf_polars_cu12-25.2.2-py3-none-any.whl → cudf_polars_cu12-25.6.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
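The summary above shows where this release moved: engine configuration is consolidated in the new `cudf_polars/utils/config.py`, and the `experimental` package gains a multi-partition streaming executor (shuffle, join, groupby, distinct, repartition, scheduler, spilling). A minimal sketch of how that surfaces through polars' `GPUEngine` entry point; the `executor` and `raise_on_fail` option names are inferred from the `callback.py` diff below, so treat them as assumptions rather than documented API:

```python
# Hypothetical usage sketch; requires a CUDA-capable GPU with
# cudf-polars-cu12 25.6.0 installed alongside polars.
import polars as pl

q = (
    pl.LazyFrame({"key": [1, 1, 2], "value": [10, 20, 30]})
    .group_by("key")
    .agg(pl.col("value").sum())
)

# "streaming" selects the new partitioned executor; omitting `executor`
# falls back to the single-GPU "in-memory" executor (see callback.py below).
engine = pl.GPUEngine(raise_on_fail=True, executor="streaming")
result = q.collect(engine=engine)
```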
cudf_polars/VERSION
CHANGED

```diff
@@ -1 +1 @@
-25.02.02
+25.06.00
```
cudf_polars/callback.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Callback for the polars collect function to execute on device."""
@@ -7,11 +7,13 @@ from __future__ import annotations
 
 import contextlib
 import os
+import time
 import warnings
 from functools import cache, partial
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, overload
 
 import nvtx
+from typing_extensions import assert_never
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
@@ -20,6 +22,7 @@ import rmm
 from rmm._cuda import gpu
 
 from cudf_polars.dsl.translate import Translator
+from cudf_polars.utils.timer import Timer
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -29,6 +32,7 @@ if TYPE_CHECKING:
 
     from cudf_polars.dsl.ir import IR
     from cudf_polars.typing import NodeTraverser
+    from cudf_polars.utils.config import ConfigOptions
 
 __all__: list[str] = ["execute_with_cudf"]
 
@@ -41,7 +45,7 @@ _SUPPORTED_PREFETCHES = {
 }
 
 
-def _env_get_int(name, default):
+def _env_get_int(name: str, default: int) -> int:
     try:
         return int(os.getenv(name, default))
     except (ValueError, TypeError):  # pragma: no cover
@@ -173,74 +177,72 @@ def set_device(device: int | None) -> Generator[int, None, None]:
         gpu.setDevice(previous)
 
 
+@overload
 def _callback(
     ir: IR,
     with_columns: list[str] | None,
     pyarrow_predicate: str | None,
     n_rows: int | None,
+    should_time: Literal[False],
     *,
-    device: int | None,
-    memory_resource: rmm.mr.DeviceMemoryResource | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
-) -> pl.DataFrame:
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
+    timer: Timer | None,
+) -> pl.DataFrame: ...
+
+
+@overload
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+    should_time: Literal[True],
+    *,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
+    timer: Timer | None,
+) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
+
+
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+    should_time: bool,  # noqa: FBT001
+    *,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
+    timer: Timer | None,
+) -> pl.DataFrame | tuple[pl.DataFrame, list[tuple[int, int, str]]]:
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
+    if timer is not None:
+        assert should_time
     with (
         nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
         # Device must be set before memory resource is obtained.
-        set_device(device),
+        set_device(config_options.device),
         set_memory_resource(memory_resource),
     ):
-        if executor is None or executor == "pylibcudf":
-            return ir.evaluate(cache={}).to_polars()
-        elif executor == "dask-experimental":
-            from cudf_polars.experimental.parallel import evaluate_dask
-
-            return evaluate_dask(ir).to_polars()
-        else:
-            raise ValueError(f"Unknown executor '{executor}'")
-
-
-def validate_config_options(config: dict) -> None:
-    """
-    Validate the configuration options for the GPU engine.
-
-    Parameters
-    ----------
-    config
-        Configuration options to validate.
-
-    Raises
-    ------
-    ValueError
-        If the configuration contains unsupported options.
-    """
-    if unsupported := (
-        config.keys()
-        - {"raise_on_fail", "parquet_options", "executor", "executor_options"}
-    ):
-        raise ValueError(
-            f"Engine configuration contains unsupported settings: {unsupported}"
-        )
-    assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
-        config.get("parquet_options", {})
-    )
-
-    # Validate executor_options
-    executor = config.get("executor", "pylibcudf")
-    if executor == "dask-experimental":
-        unsupported = config.get("executor_options", {}).keys() - {
-            "max_rows_per_partition",
-            "parquet_blocksize",
-        }
-    else:
-        unsupported = config.get("executor_options", {}).keys()
-    if unsupported:
-        raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
-
-
-def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
+        if config_options.executor.name == "in-memory":
+            df = ir.evaluate(cache={}, timer=timer).to_polars()
+            if timer is None:
+                return df
+            else:
+                return df, timer.timings
+        elif config_options.executor.name == "streaming":
+            from cudf_polars.experimental.parallel import evaluate_streaming
+
+            return evaluate_streaming(ir, config_options).to_polars()
+        assert_never(f"Unknown executor '{config_options.executor}'")
+
+
+def execute_with_cudf(
+    nt: NodeTraverser, duration_since_start: int | None, *, config: GPUEngine
+) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
@@ -249,8 +251,12 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     nt
         NodeTraverser
 
+    duration_since_start
+        Time since the user started executing the query (or None if no
+        profiling should occur).
+
     config
-        GPUEngine configuration object
+        GPUEngine object. Configuration is available as ``engine.config``.
 
     Raises
     ------
@@ -263,16 +269,27 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     -----
    The NodeTraverser is mutated if the libcudf executor can handle the plan.
    """
-    device = config.device
+    if duration_since_start is None:
+        timer = None
+    else:
+        start = time.monotonic_ns()
+        timer = Timer(start - duration_since_start)
+
     memory_resource = config.memory_resource
-    raise_on_fail = config.config.get("raise_on_fail", False)
-    executor = config.config.get("executor", None)
-    validate_config_options(config.config)
 
     with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
         translator = Translator(nt, config)
         ir = translator.translate_ir()
         ir_translation_errors = translator.errors
+        if timer is not None:
+            timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
+
+        if (
+            memory_resource is None
+            and translator.config_options.executor.name == "streaming"
+            and translator.config_options.executor.scheduler == "distributed"
+        ):  # pragma: no cover; Requires distributed cluster
+            memory_resource = rmm.mr.get_current_device_resource()
         if len(ir_translation_errors):
             # TODO: Display these errors in user-friendly way.
             # tracked in https://github.com/rapidsai/cudf/issues/17051
@@ -287,15 +304,15 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
             exception = NotImplementedError(error_message, unique_errors)
             if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
                 warnings.warn(error_message, PerformanceWarning, stacklevel=2)
-            if raise_on_fail:
+            if translator.config_options.raise_on_fail:
                 raise exception
         else:
             nt.set_udf(
                 partial(
                     _callback,
                     ir,
-                    device=device,
                     memory_resource=memory_resource,
-                    executor=executor,
+                    config_options=translator.config_options,
+                    timer=timer,
                 )
             )
```
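The rewritten `_callback` above uses the standard `typing.overload` pattern: a `Literal`-typed boolean parameter statically selects the return type, so type checkers know a profiled call returns a `(DataFrame, timings)` pair while an unprofiled call returns a bare `DataFrame`. A self-contained sketch of the same pattern with illustrative names (not cudf_polars APIs):

```python
from __future__ import annotations

from typing import Literal, overload

# Timing records are (start_ns, end_ns, label) triples, mirroring the
# Timer.timings payload threaded through _callback in the diff above.
Timings = list[tuple[int, int, str]]


@overload
def run(should_time: Literal[False]) -> str: ...
@overload
def run(should_time: Literal[True]) -> tuple[str, Timings]: ...
def run(should_time: bool) -> str | tuple[str, Timings]:
    """Return just a result, or a (result, timings) pair when timed."""
    result = "done"
    if should_time:
        return result, [(0, 1, "run")]
    return result


print(run(False))  # -> "done"
print(run(True))   # -> ("done", [(0, 1, "run")])
```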
cudf_polars/containers/column.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """A column, with some properties."""
@@ -19,6 +19,7 @@ from pylibcudf.strings.convert.convert_integers import (
 )
 from pylibcudf.traits import is_floating_point
 
+from cudf_polars.utils import conversion
 from cudf_polars.utils.dtypes import is_order_preserving_cast
 
 if TYPE_CHECKING:
@@ -26,6 +27,8 @@ if TYPE_CHECKING:
 
     import polars as pl
 
+    from cudf_polars.typing import ColumnHeader, ColumnOptions, Slice
+
 __all__: list[str] = ["Column"]
 
 
@@ -51,10 +54,69 @@ class Column:
         name: str | None = None,
     ):
         self.obj = column
-        self.is_scalar = self.obj.size() == 1
+        self.is_scalar = self.size == 1
         self.name = name
         self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
 
+    @classmethod
+    def deserialize(
+        cls, header: ColumnHeader, frames: tuple[memoryview, plc.gpumemoryview]
+    ) -> Self:
+        """
+        Create a Column from a serialized representation returned by `.serialize()`.
+
+        Parameters
+        ----------
+        header
+            The (unpickled) metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames (a memoryview and a gpumemoryview).
+
+        Returns
+        -------
+        Column
+            The deserialized Column.
+        """
+        packed_metadata, packed_gpu_data = frames
+        (plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
+            packed_metadata, packed_gpu_data
+        ).columns()
+        return cls(plc_column, **header["column_kwargs"])
+
+    def serialize(
+        self,
+    ) -> tuple[ColumnHeader, tuple[memoryview, plc.gpumemoryview]]:
+        """
+        Serialize the Column into header and frames.
+
+        Follows the Dask serialization scheme with a picklable header (dict) and
+        a tuple of frames (in this case a contiguous host and device buffer).
+
+        To enable dask support, dask serializers must be registered
+
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
+
+        Returns
+        -------
+        header
+            A dict containing any picklable metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
+        """
+        packed = plc.contiguous_split.pack(plc.Table([self.obj]))
+        column_kwargs: ColumnOptions = {
+            "is_sorted": self.is_sorted,
+            "order": self.order,
+            "null_order": self.null_order,
+            "name": self.name,
+        }
+        header: ColumnHeader = {
+            "column_kwargs": column_kwargs,
+            "frame_count": 2,
+        }
+        return header, packed.release()
+
     @functools.cached_property
     def obj_scalar(self) -> plc.Scalar:
         """
@@ -70,9 +132,7 @@
         If the column is not length-1.
         """
         if not self.is_scalar:
-            raise ValueError(
-                f"Cannot convert a column of length {self.obj.size()} to scalar"
-            )
+            raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
         return plc.copying.get_element(self.obj, 0)
 
     def rename(self, name: str | None, /) -> Self:
@@ -117,6 +177,44 @@
             null_order=like.null_order,
         )
 
+    def check_sorted(
+        self,
+        *,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> bool:
+        """
+        Check if the column is sorted.
+
+        Parameters
+        ----------
+        order
+            The requested sort order.
+        null_order
+            Where nulls sort to.
+
+        Returns
+        -------
+        True if the column is sorted, false otherwise.
+
+        Notes
+        -----
+        If the sortedness flag is not set, this launches a kernel to
+        check sortedness.
+        """
+        if self.obj.size() <= 1 or self.obj.size() == self.obj.null_count():
+            return True
+        if self.is_sorted == plc.types.Sorted.YES:
+            return self.order == order and (
+                self.obj.null_count() == 0 or self.null_order == null_order
+            )
+        if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
+            self.sorted = plc.types.Sorted.YES
+            self.order = order
+            self.null_order = null_order
+            return True
+        return False
+
     def astype(self, dtype: plc.DataType) -> Column:
         """
         Cast the column to as the requested dtype.
@@ -242,7 +340,7 @@
         -------
         Self with metadata set.
         """
-        if self.obj.size() <= 1:
+        if self.size <= 1:
             is_sorted = plc.types.Sorted.YES
         self.is_sorted = is_sorted
         self.order = order
@@ -268,7 +366,7 @@
     def mask_nans(self) -> Self:
         """Return a shallow copy of self with nans masked out."""
         if plc.traits.is_floating_point(self.obj.type()):
-            old_count = self.obj.null_count()
+            old_count = self.null_count
             mask, new_count = plc.transform.nans_to_nulls(self.obj)
             result = type(self)(self.obj.with_mask(mask, new_count))
             if old_count == new_count:
@@ -288,3 +386,36 @@
                 )
             ).as_py()
         return 0
+
+    @property
+    def size(self) -> int:
+        """Return the size of the column."""
+        return self.obj.size()
+
+    @property
+    def null_count(self) -> int:
+        """Return the number of Null values in the column."""
+        return self.obj.null_count()
+
+    def slice(self, zlice: Slice | None) -> Self:
+        """
+        Slice a column.
+
+        Parameters
+        ----------
+        zlice
+            optional, tuple of start and length, negative values of start
+            treated as for python indexing. If not provided, returns self.
+
+        Returns
+        -------
+        New column (if zlice is not None) otherwise self (if it is)
+        """
+        if zlice is None:
+            return self
+        (table,) = plc.copying.slice(
+            plc.Table([self.obj]),
+            conversion.from_polars_slice(zlice, num_rows=self.size),
+        )
+        (column,) = table.columns()
+        return type(self)(column, name=self.name).sorted_like(self)
```
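The new `Column.serialize`/`Column.deserialize` pair (and the `DataFrame` equivalents below) always produce exactly two frames: `plc.contiguous_split.pack(...).release()` yields one host metadata buffer and one contiguous device buffer, while the picklable header carries the `Column` constructor kwargs. A round-trip sketch, assuming a working CUDA environment; the `DataFrame.from_polars` constructor name is an assumption (the dataframe.py diff below shows its body but not its name):

```python
import polars as pl

from cudf_polars.containers import DataFrame

# Assumed entry point: build a device DataFrame from a polars DataFrame.
df = DataFrame.from_polars(pl.DataFrame({"a": [1.0, 2.0, 3.0]}))
col = df.columns[0]

# header is a picklable dict; frames is (memoryview, gpumemoryview).
header, frames = col.serialize()
restored = type(col).deserialize(header, frames)
assert restored.name == col.name and restored.size == col.size
```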
cudf_polars/containers/dataframe.py
CHANGED

```diff
@@ -1,27 +1,26 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """A dataframe, with some properties."""
 
 from __future__ import annotations
 
-import pickle
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, cast
-
-import pyarrow as pa
+from typing import TYPE_CHECKING, cast
 
 import polars as pl
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.utils import dtypes
+from cudf_polars.utils import conversion
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence, Set
 
-    from typing_extensions import Self
+    from typing_extensions import Any, Self
+
+    from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
 
 
 __all__: list[str] = ["DataFrame"]
@@ -107,17 +106,12 @@
         -------
         New dataframe representing the input.
         """
-        table = df.to_arrow()
-        schema = table.schema
-        for i, field in enumerate(schema):
-            schema = schema.set(
-                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
-            )
-        # No-op if the schema is unchanged.
-        d_table = plc.interop.from_arrow(table.cast(schema))
+        plc_table = plc.Table(df)
         return cls(
-            Column(column, name=h_col.name).copy_metadata(h_col)
-            for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True)
+            Column(d_col, name=name).copy_metadata(h_col)
+            for d_col, h_col, name in zip(
+                plc_table.columns(), df.iter_columns(), df.columns, strict=True
+            )
         )
 
     @classmethod
@@ -150,7 +144,7 @@
 
     @classmethod
     def deserialize(
-        cls, header: Mapping[str, Any], frames: tuple[memoryview, plc.gpumemoryview]
+        cls, header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
     ) -> Self:
         """
         Create a DataFrame from a serialized representation returned by `.serialize()`.
@@ -178,7 +172,7 @@
 
     def serialize(
         self,
-    ) -> tuple[Mapping[str, Any], tuple[memoryview, plc.gpumemoryview]]:
+    ) -> tuple[DataFrameHeader, tuple[memoryview, plc.gpumemoryview]]:
         """
         Serialize the table into header and frames.
 
@@ -187,20 +181,20 @@
 
         To enable dask support, dask serializers must be registered
 
-            >>> from cudf_polars.experimental.dask_serialize import register
-            >>> register()
+        >>> from cudf_polars.experimental.dask_serialize import register
+        >>> register()
 
         Returns
         -------
         header
             A dict containing any picklable metadata required to reconstruct the object.
         frames
-            Two-tuple of frames suitable for passing to `unpack_from_memoryviews`
+            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
         """
         packed = plc.contiguous_split.pack(self.table)
 
         # Keyword arguments for `Column.__init__`.
-        columns_kwargs = [
+        columns_kwargs: list[ColumnOptions] = [
             {
                 "is_sorted": col.is_sorted,
                 "order": col.order,
@@ -209,10 +203,8 @@
             }
             for col in self.columns
         ]
-        header = {
+        header: DataFrameHeader = {
             "columns_kwargs": columns_kwargs,
-            # Dask Distributed uses "type-serialized" to dispatch deserialization
-            "type-serialized": pickle.dumps(type(self)),
             "frame_count": 2,
         }
         return header, packed.release()
@@ -247,7 +239,9 @@
             for c, other in zip(self.columns, like.columns, strict=True)
         )
 
-    def with_columns(self, columns: Sequence[Column], *, replace_only=False) -> Self:
+    def with_columns(
+        self, columns: Iterable[Column], *, replace_only: bool = False
+    ) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -276,7 +270,7 @@
         """Drop columns by name."""
         return type(self)(column for column in self.columns if column.name not in names)
 
-    def select(self, names: Sequence[str]) -> Self:
+    def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
         """Select columns by name returning DataFrame."""
         try:
             return type(self)(self.column_map[name] for name in names)
@@ -296,7 +290,7 @@
         table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
         return type(self).from_table(table, self.column_names).sorted_like(self)
 
-    def slice(self, zlice: tuple[int, int] | None) -> Self:
+    def slice(self, zlice: Slice | None) -> Self:
         """
         Slice a dataframe.
 
@@ -312,14 +306,7 @@
         """
         if zlice is None:
             return self
-        start, length = zlice
-        if start < 0:
-            start += self.num_rows
-        # Polars implementation wraps negative start by num_rows, then
-        # adds length to start to get the end, then clamps both to
-        # [0, num_rows)
-        end = start + length
-        start = max(min(start, self.num_rows), 0)
-        end = max(min(end, self.num_rows), 0)
-        (table,) = plc.copying.slice(self.table, [start, end])
+        (table,) = plc.copying.slice(
+            self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
+        )
         return type(self).from_table(table, self.column_names).sorted_like(self)
```
cudf_polars/dsl/expr.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -30,6 +30,7 @@ from cudf_polars.dsl.expressions.datetime import TemporalFunction
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
 from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow
 from cudf_polars.dsl.expressions.selection import Filter, Gather
+from cudf_polars.dsl.expressions.slicing import Slice
 from cudf_polars.dsl.expressions.sorting import Sort, SortBy
 from cudf_polars.dsl.expressions.string import StringFunction
 from cudf_polars.dsl.expressions.ternary import Ternary
@@ -53,6 +54,7 @@ __all__ = [
     "LiteralColumn",
     "NamedExpr",
     "RollingWindow",
+    "Slice",
     "Sort",
     "SortBy",
     "StringFunction",
```