cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Utilities for rolling window aggregations."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
import pylibcudf as plc
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from cudf_polars.typing import ClosedInterval, Duration
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"duration_to_int",
|
|
20
|
+
"duration_to_scalar",
|
|
21
|
+
"offsets_to_windows",
|
|
22
|
+
"range_window_bounds",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def duration_to_int(
    dtype: plc.DataType,
    months: int,
    weeks: int,
    days: int,
    nanoseconds: int,
    parsed_int: bool,  # noqa: FBT001
    negative: bool,  # noqa: FBT001
) -> int:
    """
    Convert a polars duration value to an integer.

    Parameters
    ----------
    dtype
        The type of the column being added to.
    months
        Number of months
    weeks
        Number of weeks
    days
        Number of days
    nanoseconds
        Number of nanoseconds
    parsed_int
        Is this actually a representation of an integer, not a duration?
    negative
        Is this a negative duration?

    Returns
    -------
    int
        The total number of nanoseconds represented by this duration,
        or just an integer if `parsed_int` was true.

    Raises
    ------
    NotImplementedError
        For unsupported durations or datatypes.
    """
    if months != 0:
        raise NotImplementedError("Month durations in rolling windows")
    is_int64 = dtype.id() == plc.TypeId.INT64
    if parsed_int:
        # A parsed integer must carry no calendar components and must
        # target an INT64 column.
        if weeks != 0 or days != 0 or not is_int64:
            raise NotImplementedError(
                "Invalid duration for parsed_int"
            )  # pragma: no cover; polars raises first
    elif is_int64:
        # INT64 orderby columns only make sense with integer offsets.
        raise pl.exceptions.InvalidOperationError("Duration must be a parsed integer")
    # Collapse weeks/days into nanoseconds (86_400 s per day, 1e9 ns per s).
    total = (7 * weeks + days) * 86_400 * 1_000_000_000 + nanoseconds
    return -total if negative else total
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def duration_to_scalar(dtype: plc.DataType, value: int) -> plc.Scalar:
    """
    Convert a raw polars duration value to a pylibcudf scalar.

    Parameters
    ----------
    dtype
        The type of the column being added to.
    value
        The raw value as an integer. If `dtype` represents a timestamp
        type, this should be in nanoseconds.

    Returns
    -------
    pylibcudf.Scalar
        With datatype matching the provided dtype.

    Raises
    ------
    NotImplementedError
        For unsupported durations or datatypes.
    """
    tid = dtype.id()
    if tid == plc.TypeId.INT64:
        return plc.Scalar.from_py(value, dtype)
    # Timestamp resolution -> (divisor from nanoseconds, matching duration type).
    conversions = {
        plc.TypeId.TIMESTAMP_NANOSECONDS: (1, plc.TypeId.DURATION_NANOSECONDS),
        plc.TypeId.TIMESTAMP_MICROSECONDS: (1_000, plc.TypeId.DURATION_MICROSECONDS),
        plc.TypeId.TIMESTAMP_MILLISECONDS: (1_000_000, plc.TypeId.DURATION_MILLISECONDS),
    }
    if tid not in conversions:
        raise NotImplementedError("Unsupported data type in rolling window offset")
    divisor, duration_tid = conversions[tid]
    # NOTE(review): floor division rounds negative offsets toward -inf
    # (e.g. -1500ns // 1000 == -2us) — confirm this matches polars' rounding.
    return plc.Scalar.from_py(value // divisor, plc.DataType(duration_tid))
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def offsets_to_windows(
    dtype: plc.DataType,
    offset: Duration,
    period: Duration,
) -> tuple[plc.Scalar, plc.Scalar]:
    """
    Convert polars offset/period pair to preceding/following windows.

    Parameters
    ----------
    dtype
        Datatype of column defining windows
    offset
        Offset duration
    period
        Period of window

    Returns
    -------
    tuple of preceding and following windows as pyarrow scalars.
    """
    start = duration_to_int(dtype, *offset)
    length = duration_to_int(dtype, *period)
    # Polars windows span current_row + offset, ..., current_row + offset + period;
    # libcudf expects current_row - preceding, ..., current_row + following.
    preceding = duration_to_scalar(dtype, -start)
    following = duration_to_scalar(dtype, start + length)
    return preceding, following
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def range_window_bounds(
    preceding: plc.Scalar, following: plc.Scalar, closed_window: ClosedInterval
) -> tuple[plc.rolling.RangeWindowType, plc.rolling.RangeWindowType]:
    """
    Convert preceding and following scalars to range window specs.

    Parameters
    ----------
    preceding
        The preceding window scalar.
    following
        The following window scalar.
    closed_window
        How the window interval endpoints are treated.

    Returns
    -------
    tuple
        Of preceding and following range window types.
    """

    def as_bound(scalar: plc.Scalar, closed: bool) -> plc.rolling.RangeWindowType:  # noqa: FBT001
        # Closed endpoints include the boundary row; open endpoints exclude it.
        if closed:
            return plc.rolling.BoundedClosed(scalar)
        return plc.rolling.BoundedOpen(scalar)

    # "both" closes both ends, "left"/"right" close one end, anything
    # else (i.e. "none") leaves both ends open.
    left_closed = closed_window in ("both", "left")
    right_closed = closed_window in ("both", "right")
    return as_bound(preceding, left_closed), as_bound(following, right_closed)
|
cudf_polars/experimental/base.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
"""Multi-partition base classes."""
|
|
4
4
|
|
|
@@ -6,26 +6,29 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import TYPE_CHECKING
|
|
8
8
|
|
|
9
|
-
from cudf_polars.dsl.ir import Union
|
|
10
|
-
|
|
11
9
|
if TYPE_CHECKING:
|
|
12
|
-
from collections.abc import Iterator
|
|
10
|
+
from collections.abc import Iterator
|
|
13
11
|
|
|
14
|
-
from cudf_polars.
|
|
12
|
+
from cudf_polars.dsl.expr import NamedExpr
|
|
15
13
|
from cudf_polars.dsl.nodebase import Node
|
|
16
14
|
|
|
17
15
|
|
|
18
16
|
class PartitionInfo:
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def __init__(
|
|
17
|
+
"""Partitioning information."""
|
|
18
|
+
|
|
19
|
+
__slots__ = ("count", "partitioned_on")
|
|
20
|
+
count: int
|
|
21
|
+
"""Partition count."""
|
|
22
|
+
partitioned_on: tuple[NamedExpr, ...]
|
|
23
|
+
"""Columns the data is hash-partitioned on."""
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
count: int,
|
|
28
|
+
partitioned_on: tuple[NamedExpr, ...] = (),
|
|
29
|
+
):
|
|
28
30
|
self.count = count
|
|
31
|
+
self.partitioned_on = partitioned_on
|
|
29
32
|
|
|
30
33
|
def keys(self, node: Node) -> Iterator[tuple[str, int]]:
|
|
31
34
|
"""Return the partitioned keys for a given node."""
|
|
@@ -36,8 +39,3 @@ class PartitionInfo:
|
|
|
36
39
|
def get_key_name(node: Node) -> str:
|
|
37
40
|
"""Generate the key name for a Node."""
|
|
38
41
|
return f"{type(node).__name__.lower()}-{hash(node)}"
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def _concat(dfs: Sequence[DataFrame]) -> DataFrame:
|
|
42
|
-
# Concatenate a sequence of DataFrames vertically
|
|
43
|
-
return Union.do_evaluate(None, *dfs)
|