cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.4.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +85 -53
- cudf_polars/containers/column.py +100 -7
- cudf_polars/containers/dataframe.py +16 -24
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +3 -3
- cudf_polars/dsl/expressions/binaryop.py +2 -2
- cudf_polars/dsl/expressions/boolean.py +4 -4
- cudf_polars/dsl/expressions/datetime.py +39 -1
- cudf_polars/dsl/expressions/literal.py +3 -9
- cudf_polars/dsl/expressions/selection.py +2 -2
- cudf_polars/dsl/expressions/slicing.py +53 -0
- cudf_polars/dsl/expressions/sorting.py +1 -1
- cudf_polars/dsl/expressions/string.py +4 -4
- cudf_polars/dsl/expressions/unary.py +3 -2
- cudf_polars/dsl/ir.py +222 -93
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/translate.py +66 -38
- cudf_polars/experimental/base.py +18 -12
- cudf_polars/experimental/dask_serialize.py +22 -8
- cudf_polars/experimental/groupby.py +346 -0
- cudf_polars/experimental/io.py +13 -11
- cudf_polars/experimental/join.py +318 -0
- cudf_polars/experimental/parallel.py +57 -6
- cudf_polars/experimental/shuffle.py +194 -0
- cudf_polars/testing/plugin.py +23 -34
- cudf_polars/typing/__init__.py +33 -2
- cudf_polars/utils/config.py +138 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +14 -4
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +4 -3
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.4.0.dist-info}/METADATA +8 -7
- cudf_polars_cu12-25.4.0.dist-info/RECORD +55 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.4.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.4.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.4.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION
CHANGED

@@ -1 +1 @@
-25.02.02
+25.04.00
cudf_polars/callback.py
CHANGED

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Callback for the polars collect function to execute on device."""
@@ -7,9 +7,10 @@ from __future__ import annotations
 
 import contextlib
 import os
+import time
 import warnings
 from functools import cache, partial
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, overload
 
 import nvtx
 
@@ -20,6 +21,8 @@ import rmm
 from rmm._cuda import gpu
 
 from cudf_polars.dsl.translate import Translator
+from cudf_polars.utils.timer import Timer
+from cudf_polars.utils.versions import POLARS_VERSION_LT_125
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -173,19 +176,53 @@ def set_device(device: int | None) -> Generator[int, None, None]:
         gpu.setDevice(previous)
 
 
+@overload
 def _callback(
     ir: IR,
     with_columns: list[str] | None,
     pyarrow_predicate: str | None,
     n_rows: int | None,
+    should_time: Literal[False],
     *,
     device: int | None,
     memory_resource: int | None,
     executor: Literal["pylibcudf", "dask-experimental"] | None,
-) -> pl.DataFrame:
+    timer: Timer | None,
+) -> pl.DataFrame: ...
+
+
+@overload
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+    should_time: Literal[True],
+    *,
+    device: int | None,
+    memory_resource: int | None,
+    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    timer: Timer | None,
+) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
+
+
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+    should_time: bool,  # noqa: FBT001
+    *,
+    device: int | None,
+    memory_resource: int | None,
+    executor: Literal["pylibcudf", "dask-experimental"] | None,
+    timer: Timer | None,
+):
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
+    if timer is not None:
+        assert should_time
     with (
         nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
         # Device must be set before memory resource is obtained.
@@ -193,7 +230,11 @@ def _callback(
         set_memory_resource(memory_resource),
     ):
         if executor is None or executor == "pylibcudf":
-            return ir.evaluate(cache={}).to_polars()
+            df = ir.evaluate(cache={}, timer=timer).to_polars()
+            if timer is None:
+                return df
+            else:
+                return df, timer.timings
         elif executor == "dask-experimental":
             from cudf_polars.experimental.parallel import evaluate_dask
 
@@ -202,45 +243,9 @@ def _callback(
     raise ValueError(f"Unknown executor '{executor}'")
 
 
-def validate_config_options(config: dict) -> None:
-    """
-    Validate the configuration options for the GPU engine.
-
-    Parameters
-    ----------
-    config
-        Configuration options to validate.
-
-    Raises
-    ------
-    ValueError
-        If the configuration contains unsupported options.
-    """
-    if unsupported := (
-        config.keys()
-        - {"raise_on_fail", "parquet_options", "executor", "executor_options"}
-    ):
-        raise ValueError(
-            f"Engine configuration contains unsupported settings: {unsupported}"
-        )
-    assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
-        config.get("parquet_options", {})
-    )
-
-    # Validate executor_options
-    executor = config.get("executor", "pylibcudf")
-    if executor == "dask-experimental":
-        unsupported = config.get("executor_options", {}).keys() - {
-            "max_rows_per_partition",
-            "parquet_blocksize",
-        }
-    else:
-        unsupported = config.get("executor_options", {}).keys()
-    if unsupported:
-        raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
-
-
-def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
+def execute_with_cudf(
+    nt: NodeTraverser, duration_since_start: int | None, *, config: GPUEngine
+) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
@@ -249,6 +254,10 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     nt
         NodeTraverser
 
+    duration_since_start
+        Time since the user started executing the query (or None if no
+        profiling should occur).
+
     config
         GPUEngine configuration object
 
@@ -263,16 +272,21 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     -----
    The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
+    if duration_since_start is None:
+        timer = None
+    else:
+        start = time.monotonic_ns()
+        timer = Timer(start - duration_since_start)
    device = config.device
    memory_resource = config.memory_resource
    raise_on_fail = config.config.get("raise_on_fail", False)
    executor = config.config.get("executor", None)
-    validate_config_options(config.config)
-
    with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
        translator = Translator(nt, config)
        ir = translator.translate_ir()
        ir_translation_errors = translator.errors
+        if timer is not None:
+            timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
    if len(ir_translation_errors):
        # TODO: Display these errors in user-friendly way.
        # tracked in https://github.com/rapidsai/cudf/issues/17051
@@ -290,12 +304,30 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
        if raise_on_fail:
            raise exception
        else:
-            nt.set_udf(
-                partial(
-                    _callback,
-                    ir,
-                    device=device,
-                    memory_resource=memory_resource,
-                    executor=executor,
+            if POLARS_VERSION_LT_125:  # pragma: no cover
+                nt.set_udf(
+                    partial(
+                        _callback,
+                        ir,
+                        should_time=False,
+                        device=device,
+                        memory_resource=memory_resource,
+                        executor=executor,
+                        timer=None,
+                    )
                )
-            )
+            else:
+                nt.set_udf(
+                    partial(
+                        _callback,
+                        ir,
+                        device=device,
+                        memory_resource=memory_resource,
+                        executor=executor,
+                        timer=timer,
+                    )
+                )
+
+
+if POLARS_VERSION_LT_125:  # pragma: no cover
+    execute_with_cudf = partial(execute_with_cudf, duration_since_start=None)
cudf_polars/containers/column.py
CHANGED

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """A column, with some properties."""
@@ -19,6 +19,7 @@ from pylibcudf.strings.convert.convert_integers import (
 )
 from pylibcudf.traits import is_floating_point
 
+from cudf_polars.utils import conversion
 from cudf_polars.utils.dtypes import is_order_preserving_cast
 
 if TYPE_CHECKING:
@@ -26,6 +27,8 @@ if TYPE_CHECKING:
 
     import polars as pl
 
+    from cudf_polars.typing import ColumnHeader, ColumnOptions, Slice
+
 __all__: list[str] = ["Column"]
 
 
@@ -51,10 +54,69 @@ class Column:
         name: str | None = None,
     ):
         self.obj = column
-        self.is_scalar = self.obj.size() == 1
+        self.is_scalar = self.size == 1
         self.name = name
         self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
 
+    @classmethod
+    def deserialize(
+        cls, header: ColumnHeader, frames: tuple[memoryview, plc.gpumemoryview]
+    ) -> Self:
+        """
+        Create a Column from a serialized representation returned by `.serialize()`.
+
+        Parameters
+        ----------
+        header
+            The (unpickled) metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames (a memoryview and a gpumemoryview).
+
+        Returns
+        -------
+        Column
+            The deserialized Column.
+        """
+        packed_metadata, packed_gpu_data = frames
+        (plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
+            packed_metadata, packed_gpu_data
+        ).columns()
+        return cls(plc_column, **header["column_kwargs"])
+
+    def serialize(
+        self,
+    ) -> tuple[ColumnHeader, tuple[memoryview, plc.gpumemoryview]]:
+        """
+        Serialize the Column into header and frames.
+
+        Follows the Dask serialization scheme with a picklable header (dict) and
+        a tuple of frames (in this case a contiguous host and device buffer).
+
+        To enable dask support, dask serializers must be registered
+
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
+
+        Returns
+        -------
+        header
+            A dict containing any picklable metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
+        """
+        packed = plc.contiguous_split.pack(plc.Table([self.obj]))
+        column_kwargs: ColumnOptions = {
+            "is_sorted": self.is_sorted,
+            "order": self.order,
+            "null_order": self.null_order,
+            "name": self.name,
+        }
+        header: ColumnHeader = {
+            "column_kwargs": column_kwargs,
+            "frame_count": 2,
+        }
+        return header, packed.release()
+
     @functools.cached_property
     def obj_scalar(self) -> plc.Scalar:
         """
@@ -70,9 +132,7 @@ class Column:
             If the column is not length-1.
         """
         if not self.is_scalar:
-            raise ValueError(
-                f"Cannot convert a column of length {self.obj.size()} to scalar"
-            )
+            raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
         return plc.copying.get_element(self.obj, 0)
 
     def rename(self, name: str | None, /) -> Self:
@@ -242,7 +302,7 @@ class Column:
        -------
        Self with metadata set.
        """
-        if self.obj.size() <= 1:
+        if self.size <= 1:
            is_sorted = plc.types.Sorted.YES
        self.is_sorted = is_sorted
        self.order = order
@@ -268,7 +328,7 @@ class Column:
    def mask_nans(self) -> Self:
        """Return a shallow copy of self with nans masked out."""
        if plc.traits.is_floating_point(self.obj.type()):
-            old_count = self.obj.null_count()
+            old_count = self.null_count
            mask, new_count = plc.transform.nans_to_nulls(self.obj)
            result = type(self)(self.obj.with_mask(mask, new_count))
            if old_count == new_count:
@@ -288,3 +348,36 @@ class Column:
            )
        ).as_py()
        return 0
+
+    @property
+    def size(self) -> int:
+        """Return the size of the column."""
+        return self.obj.size()
+
+    @property
+    def null_count(self) -> int:
+        """Return the number of Null values in the column."""
+        return self.obj.null_count()
+
+    def slice(self, zlice: Slice | None) -> Self:
+        """
+        Slice a column.
+
+        Parameters
+        ----------
+        zlice
+            optional, tuple of start and length, negative values of start
+            treated as for python indexing. If not provided, returns self.
+
+        Returns
+        -------
+        New column (if zlice is not None) otherwise self (if it is)
+        """
+        if zlice is None:
+            return self
+        (table,) = plc.copying.slice(
+            plc.Table([self.obj]),
+            conversion.from_polars_slice(zlice, num_rows=self.size),
+        )
+        (column,) = table.columns()
+        return type(self)(column, name=self.name).sorted_like(self)
cudf_polars/containers/dataframe.py
CHANGED

@@ -1,13 +1,12 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """A dataframe, with some properties."""
 
 from __future__ import annotations
 
-import pickle
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, cast
 
 import pyarrow as pa
 
@@ -16,13 +15,15 @@ import polars as pl
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.utils import dtypes
+from cudf_polars.utils import conversion, dtypes
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence, Set
 
     from typing_extensions import Self
 
+    from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
+
 
 __all__: list[str] = ["DataFrame"]
 
@@ -150,7 +151,7 @@ class DataFrame:
 
     @classmethod
     def deserialize(
-        cls, header: dict[str, Any], frames: tuple[memoryview, plc.gpumemoryview]
+        cls, header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
     ) -> Self:
         """
         Create a DataFrame from a serialized representation returned by `.serialize()`.
@@ -178,7 +179,7 @@ class DataFrame:
 
     def serialize(
         self,
-    ) -> tuple[dict[str, Any], tuple[memoryview, plc.gpumemoryview]]:
+    ) -> tuple[DataFrameHeader, tuple[memoryview, plc.gpumemoryview]]:
         """
         Serialize the table into header and frames.
 
@@ -187,20 +188,20 @@ class DataFrame:
 
         To enable dask support, dask serializers must be registered
 
-        >>> from cudf_polars.experimental.dask_serialize import register
-        >>> register()
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
 
         Returns
         -------
         header
             A dict containing any picklable metadata required to reconstruct the object.
         frames
-            Two-tuple of frames suitable for passing to `unpack_from_memoryviews`
+            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
         """
         packed = plc.contiguous_split.pack(self.table)
 
         # Keyword arguments for `Column.__init__`.
-        columns_kwargs = [
+        columns_kwargs: list[ColumnOptions] = [
             {
                 "is_sorted": col.is_sorted,
                 "order": col.order,
@@ -209,10 +210,8 @@ class DataFrame:
             }
             for col in self.columns
         ]
-        header = {
+        header: DataFrameHeader = {
             "columns_kwargs": columns_kwargs,
-            # Dask Distributed uses "type-serialized" to dispatch deserialization
-            "type-serialized": pickle.dumps(type(self)),
             "frame_count": 2,
         }
         return header, packed.release()
@@ -296,7 +295,7 @@ class DataFrame:
         table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
         return type(self).from_table(table, self.column_names).sorted_like(self)
 
-    def slice(self, zlice: tuple[int, int] | None) -> Self:
+    def slice(self, zlice: Slice | None) -> Self:
         """
         Slice a dataframe.
 
@@ -312,14 +311,7 @@ class DataFrame:
         """
         if zlice is None:
             return self
-        start, length = zlice
-        if start < 0:
-            start += self.num_rows
-        # Polars implementation wraps negative start by num_rows, then
-        # adds length to start to get the end, then clamps both to
-        # [0, num_rows)
-        end = start + length
-        start = max(min(start, self.num_rows), 0)
-        end = max(min(end, self.num_rows), 0)
-        (table,) = plc.copying.slice(self.table, [start, end])
+        (table,) = plc.copying.slice(
+            self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
+        )
         return type(self).from_table(table, self.column_names).sorted_like(self)
cudf_polars/dsl/expr.py
CHANGED

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -30,6 +30,7 @@ from cudf_polars.dsl.expressions.datetime import TemporalFunction
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
 from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow
 from cudf_polars.dsl.expressions.selection import Filter, Gather
+from cudf_polars.dsl.expressions.slicing import Slice
 from cudf_polars.dsl.expressions.sorting import Sort, SortBy
 from cudf_polars.dsl.expressions.string import StringFunction
 from cudf_polars.dsl.expressions.ternary import Ternary
@@ -53,6 +54,7 @@ __all__ = [
     "LiteralColumn",
     "NamedExpr",
     "RollingWindow",
+    "Slice",
     "Sort",
     "SortBy",
     "StringFunction",
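The new `Slice` export corresponds to the expression added in `cudf_polars/dsl/expressions/slicing.py` (+53 lines). A hedged guess at what this enables from the polars side, assuming expression-level slices previously fell back to the CPU engine:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]})

# Expression-level slicing (Expr.slice and friends) presumably lowers to
# the new dsl Slice node and can now execute on the GPU engine.
out = lf.select(pl.col("a").slice(1, 3)).collect(
    engine=pl.GPUEngine(raise_on_fail=True)
)
```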
cudf_polars/dsl/expressions/aggregation.py
CHANGED

@@ -172,7 +172,7 @@ class Agg(Expr):
             plc.Column.from_scalar(
                 plc.interop.from_arrow(
                     pa.scalar(
-                        column.obj.size() - column.obj.null_count(),
+                        column.size - column.null_count,
                         type=plc.interop.to_arrow(self.dtype),
                     ),
                 ),
@@ -181,7 +181,7 @@ class Agg(Expr):
         )
 
     def _sum(self, column: Column) -> Column:
-        if column.obj.size() == 0 or column.obj.null_count() == column.obj.size():
+        if column.size == 0 or column.null_count == column.size:
             return Column(
                 plc.Column.from_scalar(
                     plc.interop.from_arrow(
@@ -224,7 +224,7 @@ class Agg(Expr):
         return Column(plc.copying.slice(column.obj, [0, 1])[0])
 
     def _last(self, column: Column) -> Column:
-        n = column.obj.size()
+        n = column.size
         return Column(plc.copying.slice(column.obj, [n - 1, n])[0])
 
     def do_evaluate(
cudf_polars/dsl/expressions/binaryop.py
CHANGED

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -98,7 +98,7 @@ class BinOp(Expr):
         )
         lop = left.obj
         rop = right.obj
-        if left.obj.size() != right.obj.size():
+        if left.size != right.size:
             if left.is_scalar:
                 lop = left.obj_scalar
             elif right.is_scalar:
cudf_polars/dsl/expressions/boolean.py
CHANGED

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -191,7 +191,7 @@ class BooleanFunction(Expr):
         is_any = self.name is BooleanFunction.Name.Any
         agg = plc.aggregation.any() if is_any else plc.aggregation.all()
         result = plc.reduce.reduce(column.obj, agg, self.dtype)
-        if not ignore_nulls and column.obj.null_count() > 0:
+        if not ignore_nulls and column.null_count > 0:
             # Truth tables
             #   Any         All
             #   | F U T     | F U T
@@ -218,14 +218,14 @@ class BooleanFunction(Expr):
             (column,) = columns
             return Column(
                 plc.unary.is_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.obj.null_count()
+                    column.obj.null_mask(), column.null_count
                 )
             )
         elif self.name is BooleanFunction.Name.IsNotNan:
             (column,) = columns
             return Column(
                 plc.unary.is_not_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.obj.null_count()
+                    column.obj.null_mask(), column.null_count
                 )
             )
         elif self.name is BooleanFunction.Name.IsFirstDistinct:
cudf_polars/dsl/expressions/datetime.py
CHANGED

@@ -104,6 +104,14 @@ class TemporalFunction(Expr):
         Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
     }
 
+    _valid_ops: ClassVar[set[Name]] = {
+        *_COMPONENT_MAP.keys(),
+        Name.IsLeapYear,
+        Name.OrdinalDay,
+        Name.MonthStart,
+        Name.MonthEnd,
+    }
+
     def __init__(
         self,
         dtype: plc.DataType,
@@ -116,7 +124,7 @@ class TemporalFunction(Expr):
         self.name = name
         self.children = children
         self.is_pointwise = True
-        if self.name not in self._COMPONENT_MAP:
+        if self.name not in self._valid_ops:
             raise NotImplementedError(f"Temporal function {self.name}")
 
     def do_evaluate(
@@ -132,6 +140,36 @@ class TemporalFunction(Expr):
             for child in self.children
         ]
         (column,) = columns
+        if self.name is TemporalFunction.Name.MonthStart:
+            ends = plc.datetime.last_day_of_month(column.obj)
+            days_to_subtract = plc.datetime.days_in_month(column.obj)
+            # must subtract 1 to avoid rolling over to the previous month
+            days_to_subtract = plc.binaryop.binary_operation(
+                days_to_subtract,
+                plc.interop.from_arrow(pa.scalar(1, type=pa.int32())),
+                plc.binaryop.BinaryOperator.SUB,
+                plc.DataType(plc.TypeId.DURATION_DAYS),
+            )
+            result = plc.binaryop.binary_operation(
+                ends,
+                days_to_subtract,
+                plc.binaryop.BinaryOperator.SUB,
+                column.obj.type(),
+            )
+
+            return Column(result)
+        if self.name is TemporalFunction.Name.MonthEnd:
+            return Column(
+                plc.unary.cast(
+                    plc.datetime.last_day_of_month(column.obj), column.obj.type()
+                )
+            )
+        if self.name is TemporalFunction.Name.IsLeapYear:
+            return Column(
+                plc.datetime.is_leap_year(column.obj),
+            )
+        if self.name is TemporalFunction.Name.OrdinalDay:
+            return Column(plc.datetime.day_of_year(column.obj))
         if self.name is TemporalFunction.Name.Microsecond:
             millis = plc.datetime.extract_datetime_component(
                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
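These four newly supported operations correspond to polars' `dt.month_start`, `dt.month_end`, `dt.is_leap_year`, and `dt.ordinal_day`. Note how `MonthStart` is derived: the last day of the month minus `days_in_month - 1`, with the subtrahend built as a `DURATION_DAYS` value so the result stays in the column's timestamp type. A small end-to-end sketch, assuming a working GPU engine:

```python
from datetime import datetime

import polars as pl

lf = pl.LazyFrame({"d": [datetime(2024, 2, 15), datetime(2025, 3, 31)]})

out = lf.select(
    start=pl.col("d").dt.month_start(),  # MonthStart: last_day_of_month - (days_in_month - 1)
    end=pl.col("d").dt.month_end(),      # MonthEnd: last_day_of_month, cast to d's type
    leap=pl.col("d").dt.is_leap_year(),  # IsLeapYear
    doy=pl.col("d").dt.ordinal_day(),    # OrdinalDay: day_of_year
).collect(engine=pl.GPUEngine(raise_on_fail=True))
```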
cudf_polars/dsl/expressions/literal.py
CHANGED

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -8,21 +8,16 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
-from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
     from collections.abc import Hashable, Mapping
 
     import pyarrow as pa
 
-    import polars as pl
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Literal", "LiteralColumn"]
@@ -61,10 +56,9 @@ class LiteralColumn(Expr):
     _non_child = ("dtype", "value")
     value: pa.Array[Any]
 
-    def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
+    def __init__(self, dtype: plc.DataType, value: pa.Array) -> None:
         self.dtype = dtype
-        data = value.to_arrow()
-        self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+        self.value = value
         self.children = ()
         self.is_pointwise = True
 
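`LiteralColumn` no longer converts a `pl.Series` itself; it now expects a ready-made `pa.Array`, so the `to_arrow()` call and the large-list downcast move to the caller (presumably the translator, given the `translate.py` changes in this release). A sketch of the new contract, with illustrative dtype and values:

```python
import polars as pl
import pylibcudf as plc

from cudf_polars.dsl.expressions.literal import LiteralColumn
from cudf_polars.utils import dtypes

# The conversion that used to live in LiteralColumn.__init__: turn the
# Series into arrow and downcast any large-list types first.
series = pl.Series("lit", [1, 2, 3])
data = series.to_arrow()
value = data.cast(dtypes.downcast_arrow_lists(data.type))

expr = LiteralColumn(plc.DataType(plc.TypeId.INT64), value)
```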