cudf-polars-cu13 25.10.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions released to one of the supported registries. The information is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
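
The modules below are consumed through polars' lazy engine rather than imported directly. For orientation, a minimal usage sketch (an illustration, not part of the package: it assumes the wheel is installed in a CUDA 13 environment with a working NVIDIA driver, and uses hypothetical data):

import polars as pl

# An ordinary polars lazy query; nothing GPU-specific appears in user code.
lf = pl.LazyFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

# Requesting the "gpu" engine routes execution through cudf_polars
# (see callback.py above); unsupported plans fall back to the CPU engine.
print(lf.select(pl.col("a").sum(), pl.col("b").mean()).collect(engine="gpu"))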
cudf_polars/dsl/expressions/boolean.py
@@ -0,0 +1,326 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
# TODO: Document BooleanFunction to remove noqa
# ruff: noqa: D101
"""Boolean DSL nodes."""

from __future__ import annotations

from enum import IntEnum, auto
from functools import partial, reduce
from typing import TYPE_CHECKING, Any, ClassVar

import pylibcudf as plc

from cudf_polars.containers import Column, DataType
from cudf_polars.dsl.expressions.base import (
    ExecutionContext,
    Expr,
)

if TYPE_CHECKING:
    from typing_extensions import Self

    import polars.type_aliases as pl_types
    from polars.polars import _expr_nodes as pl_expr

    from cudf_polars.containers import DataFrame

__all__ = ["BooleanFunction"]


class BooleanFunction(Expr):
    class Name(IntEnum):
        """Internal and picklable representation of polars' `BooleanFunction`."""

        All = auto()
        AllHorizontal = auto()
        Any = auto()
        AnyHorizontal = auto()
        IsBetween = auto()
        IsClose = auto()
        IsDuplicated = auto()
        IsFinite = auto()
        IsFirstDistinct = auto()
        IsIn = auto()
        IsInfinite = auto()
        IsLastDistinct = auto()
        IsNan = auto()
        IsNotNan = auto()
        IsNotNull = auto()
        IsNull = auto()
        IsUnique = auto()
        Not = auto()

        @classmethod
        def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self:
            """Convert from polars' `BooleanFunction`."""
            try:
                function, name = str(obj).split(".", maxsplit=1)
            except ValueError:
                # Failed to unpack string
                function = None
            if function != "BooleanFunction":
                raise ValueError("BooleanFunction required")
            return getattr(cls, name)

    __slots__ = ("name", "options")
    _non_child = ("dtype", "name", "options")

    def __init__(
        self,
        dtype: DataType,
        name: BooleanFunction.Name,
        options: tuple[Any, ...],
        *children: Expr,
    ) -> None:
        self.dtype = dtype
        self.options = options
        self.name = name
        self.children = children
        self.is_pointwise = self.name not in (
            BooleanFunction.Name.All,
            BooleanFunction.Name.Any,
            BooleanFunction.Name.IsDuplicated,
            BooleanFunction.Name.IsFirstDistinct,
            BooleanFunction.Name.IsLastDistinct,
            BooleanFunction.Name.IsUnique,
        )
        if self.name in {
            BooleanFunction.Name.IsClose,
        }:
            raise NotImplementedError(
                f"Boolean function {self.name}"
            )  # pragma: no cover

    @staticmethod
    def _distinct(
        column: Column,
        dtype: DataType,
        *,
        keep: plc.stream_compaction.DuplicateKeepOption,
        source_value: plc.Scalar,
        target_value: plc.Scalar,
    ) -> Column:
        table = plc.Table([column.obj])
        indices = plc.stream_compaction.distinct_indices(
            table,
            keep,
            # TODO: polars doesn't expose options for these
            plc.types.NullEquality.EQUAL,
            plc.types.NanEquality.ALL_EQUAL,
        )
        return Column(
            plc.copying.scatter(
                [source_value],
                indices,
                plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
            ).columns()[0],
            dtype=dtype,
        )

    _BETWEEN_OPS: ClassVar[
        dict[
            pl_types.ClosedInterval,
            tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator],
        ]
    ] = {
        "none": (
            plc.binaryop.BinaryOperator.GREATER,
            plc.binaryop.BinaryOperator.LESS,
        ),
        "left": (
            plc.binaryop.BinaryOperator.GREATER_EQUAL,
            plc.binaryop.BinaryOperator.LESS,
        ),
        "right": (
            plc.binaryop.BinaryOperator.GREATER,
            plc.binaryop.BinaryOperator.LESS_EQUAL,
        ),
        "both": (
            plc.binaryop.BinaryOperator.GREATER_EQUAL,
            plc.binaryop.BinaryOperator.LESS_EQUAL,
        ),
    }

    def do_evaluate(
        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
        """Evaluate this expression given a dataframe for context."""
        if self.name in (
            BooleanFunction.Name.IsFinite,
            BooleanFunction.Name.IsInfinite,
        ):
            # Avoid evaluating the child if the dtype tells us it's unnecessary.
            (child,) = self.children
            needles = child.evaluate(df, context=context)
            is_float = needles.obj.type().id() in (
                plc.TypeId.FLOAT32,
                plc.TypeId.FLOAT64,
            )
            is_finite = self.name is BooleanFunction.Name.IsFinite
            if not is_float:
                base = plc.Column.from_scalar(
                    plc.Scalar.from_py(py_val=is_finite), needles.size
                )
                out = base.with_mask(needles.obj.null_mask(), needles.null_count)
                return Column(out, dtype=self.dtype)
            to_search = [-float("inf"), float("inf")]
            if is_finite:
                # NaN is neither finite nor infinite
                to_search.append(float("nan"))
            haystack = plc.Column.from_iterable_of_py(
                to_search,
                dtype=needles.obj.type(),
            )
            result = plc.search.contains(haystack, needles.obj)
            if is_finite:
                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
            return Column(
                result.with_mask(needles.obj.null_mask(), needles.null_count),
                dtype=self.dtype,
            )
        columns = [child.evaluate(df, context=context) for child in self.children]
        # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
        # False
        if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All):
            (ignore_nulls,) = self.options
            (column,) = columns
            is_any = self.name is BooleanFunction.Name.Any
            agg = plc.aggregation.any() if is_any else plc.aggregation.all()
            result = plc.reduce.reduce(column.obj, agg, self.dtype.plc)
            if not ignore_nulls and column.null_count > 0:
                # Truth tables
                #      Any         All
                #   | F U T     | F U T
                # --+------   --+------
                # F | F U T   F | F F F
                # U | U U T   U | F U U
                # T | T T T   T | F U T
                #
                # If the input null count was non-zero, we must
                # post-process the result to insert the correct value.
                h_result = result.to_py()
                if (is_any and not h_result) or (not is_any and h_result):
                    # Any                     All
                    # False || Null => Null   True && Null => Null
                    return Column(
                        plc.Column.all_null_like(column.obj, 1), dtype=self.dtype
                    )
            return Column(plc.Column.from_scalar(result, 1), dtype=self.dtype)
        if self.name is BooleanFunction.Name.IsNull:
            (column,) = columns
            return Column(plc.unary.is_null(column.obj), dtype=self.dtype)
        elif self.name is BooleanFunction.Name.IsNotNull:
            (column,) = columns
            return Column(plc.unary.is_valid(column.obj), dtype=self.dtype)
        elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan):
            (column,) = columns
            is_float = column.obj.type().id() in (
                plc.TypeId.FLOAT32,
                plc.TypeId.FLOAT64,
            )
            if is_float:
                op = (
                    plc.unary.is_nan
                    if self.name is BooleanFunction.Name.IsNan
                    else plc.unary.is_not_nan
                )
                base = op(column.obj)
            else:
                base = plc.Column.from_scalar(
                    plc.Scalar.from_py(
                        py_val=self.name is not BooleanFunction.Name.IsNan
                    ),
                    column.size,
                )
            out = base.with_mask(column.obj.null_mask(), column.null_count)
            return Column(out, dtype=self.dtype)
        elif self.name is BooleanFunction.Name.IsFirstDistinct:
            (column,) = columns
            return self._distinct(
                column,
                dtype=self.dtype,
                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
            )
        elif self.name is BooleanFunction.Name.IsLastDistinct:
            (column,) = columns
            return self._distinct(
                column,
                dtype=self.dtype,
                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
            )
        elif self.name is BooleanFunction.Name.IsUnique:
            (column,) = columns
            return self._distinct(
                column,
                dtype=self.dtype,
                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
            )
        elif self.name is BooleanFunction.Name.IsDuplicated:
            (column,) = columns
            return self._distinct(
                column,
                dtype=self.dtype,
                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
                source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
                target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
            )
        elif self.name is BooleanFunction.Name.AllHorizontal:
            return Column(
                reduce(
                    partial(
                        plc.binaryop.binary_operation,
                        op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND,
                        output_type=self.dtype.plc,
                    ),
                    (c.obj for c in columns),
                ),
                dtype=self.dtype,
            )
        elif self.name is BooleanFunction.Name.AnyHorizontal:
            return Column(
                reduce(
                    partial(
                        plc.binaryop.binary_operation,
                        op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
                        output_type=self.dtype.plc,
                    ),
                    (c.obj for c in columns),
                ),
                dtype=self.dtype,
            )
        elif self.name is BooleanFunction.Name.IsIn:
            needles, haystack = columns
            if haystack.obj.type().id() == plc.TypeId.LIST:
                # Unwrap values from the list column
                # the type: ignore is safe because we know that the type ID is LIST,
                # which always has an inner attribute.
                haystack = Column(
                    haystack.obj.children()[1],
                    dtype=DataType(haystack.dtype.polars.inner),  # type: ignore[attr-defined]
                ).astype(needles.dtype)
            if haystack.size:
                return Column(
                    plc.search.contains(haystack.obj, needles.obj), dtype=self.dtype
                )
            return Column(
                plc.Column.from_scalar(plc.Scalar.from_py(py_val=False), needles.size),
                dtype=self.dtype,
            )
        elif self.name is BooleanFunction.Name.Not:
            (column,) = columns
            return Column(
                plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT),
                dtype=self.dtype,
            )
        else:
            raise NotImplementedError(
                f"BooleanFunction {self.name}"
            )  # pragma: no cover; handled by init raising
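For context, a short sketch of polars expressions that lower to the BooleanFunction nodes above (hypothetical data; assumes the GPU engine is available): is_finite on a float column takes the plc.search.contains path against a +/-inf (and NaN) haystack, is_null maps to plc.unary.is_null, and is_in to plc.search.contains, with the input null mask propagated onto each result.

import polars as pl

lf = pl.LazyFrame({"x": [1.0, float("nan"), None, float("inf")]})
out = lf.select(
    pl.col("x").is_finite().alias("finite"),        # Name.IsFinite
    pl.col("x").is_null().alias("null"),            # Name.IsNull
    pl.col("x").is_in([1.0, 2.0]).alias("member"),  # Name.IsIn
).collect(engine="gpu")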
cudf_polars/dsl/expressions/datetime.py
@@ -0,0 +1,271 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
# TODO: Document TemporalFunction to remove noqa
# ruff: noqa: D101
"""DSL nodes for datetime operations."""

from __future__ import annotations

from enum import IntEnum, auto
from typing import TYPE_CHECKING, Any, ClassVar

import pylibcudf as plc

from cudf_polars.containers import Column
from cudf_polars.dsl.expressions.base import ExecutionContext, Expr

if TYPE_CHECKING:
    from typing_extensions import Self

    from polars.polars import _expr_nodes as pl_expr

    from cudf_polars.containers import DataFrame, DataType

__all__ = ["TemporalFunction"]


class TemporalFunction(Expr):
    class Name(IntEnum):
        """Internal and picklable representation of polars' `TemporalFunction`."""

        BaseUtcOffset = auto()
        CastTimeUnit = auto()
        Century = auto()
        Combine = auto()
        ConvertTimeZone = auto()
        DSTOffset = auto()
        Date = auto()
        Datetime = auto()
        DatetimeFunction = auto()
        Day = auto()
        DaysInMonth = auto()
        Duration = auto()
        Hour = auto()
        IsLeapYear = auto()
        IsoYear = auto()
        Microsecond = auto()
        Millennium = auto()
        Millisecond = auto()
        Minute = auto()
        Month = auto()
        MonthEnd = auto()
        MonthStart = auto()
        Nanosecond = auto()
        OffsetBy = auto()
        OrdinalDay = auto()
        Quarter = auto()
        Replace = auto()
        ReplaceTimeZone = auto()
        Round = auto()
        Second = auto()
        Time = auto()
        TimeStamp = auto()
        ToString = auto()
        TotalDays = auto()
        TotalHours = auto()
        TotalMicroseconds = auto()
        TotalMilliseconds = auto()
        TotalMinutes = auto()
        TotalNanoseconds = auto()
        TotalSeconds = auto()
        Truncate = auto()
        Week = auto()
        WeekDay = auto()
        WithTimeUnit = auto()
        Year = auto()

        @classmethod
        def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self:
            """Convert from polars' `TemporalFunction`."""
            try:
                function, name = str(obj).split(".", maxsplit=1)
            except ValueError:
                # Failed to unpack string
                function = None
            if function != "TemporalFunction":
                raise ValueError("TemporalFunction required")
            return getattr(cls, name)

    __slots__ = ("name", "options")
    _non_child = ("dtype", "name", "options")
    _COMPONENT_MAP: ClassVar[dict[Name, plc.datetime.DatetimeComponent]] = {
        Name.Year: plc.datetime.DatetimeComponent.YEAR,
        Name.Month: plc.datetime.DatetimeComponent.MONTH,
        Name.Day: plc.datetime.DatetimeComponent.DAY,
        Name.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY,
        Name.Hour: plc.datetime.DatetimeComponent.HOUR,
        Name.Minute: plc.datetime.DatetimeComponent.MINUTE,
        Name.Second: plc.datetime.DatetimeComponent.SECOND,
        Name.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND,
        Name.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND,
        Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
    }

    _valid_ops: ClassVar[set[Name]] = {
        *_COMPONENT_MAP.keys(),
        Name.IsLeapYear,
        Name.OrdinalDay,
        Name.ToString,
        Name.Week,
        Name.IsoYear,
        Name.MonthStart,
        Name.MonthEnd,
        Name.CastTimeUnit,
    }

    def __init__(
        self,
        dtype: DataType,
        name: TemporalFunction.Name,
        options: tuple[Any, ...],
        *children: Expr,
    ) -> None:
        self.dtype = dtype
        self.options = options
        self.name = name
        self.children = children
        self.is_pointwise = True
        if self.name not in self._valid_ops:
            raise NotImplementedError(f"Temporal function {self.name}")

        if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
            self.children[0].dtype.plc
        ):
            raise NotImplementedError("ToString is not supported on duration types")

    def do_evaluate(
        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
        """Evaluate this expression given a dataframe for context."""
        columns = [child.evaluate(df, context=context) for child in self.children]
        (column,) = columns
        if self.name is TemporalFunction.Name.CastTimeUnit:
            return Column(plc.unary.cast(column.obj, self.dtype.plc), dtype=self.dtype)
        if self.name == TemporalFunction.Name.ToString:
            return Column(
                plc.strings.convert.convert_datetime.from_timestamps(
                    column.obj,
                    self.options[0],
                    plc.Column.from_iterable_of_py([], dtype=self.dtype.plc),
                ),
                dtype=self.dtype,
            )
        if self.name is TemporalFunction.Name.Week:
            result = plc.strings.convert.convert_integers.to_integers(
                plc.strings.convert.convert_datetime.from_timestamps(
                    column.obj,
                    format="%V",
                    input_strings_names=plc.Column.from_iterable_of_py(
                        [], dtype=plc.DataType(plc.TypeId.STRING)
                    ),
                ),
                self.dtype.plc,
            )
            return Column(result, dtype=self.dtype)
        if self.name is TemporalFunction.Name.IsoYear:
            result = plc.strings.convert.convert_integers.to_integers(
                plc.strings.convert.convert_datetime.from_timestamps(
                    column.obj,
                    format="%G",
                    input_strings_names=plc.Column.from_iterable_of_py(
                        [], dtype=plc.DataType(plc.TypeId.STRING)
                    ),
                ),
                self.dtype.plc,
            )
            return Column(result, dtype=self.dtype)
        if self.name is TemporalFunction.Name.MonthStart:
            ends = plc.datetime.last_day_of_month(column.obj)
            days_to_subtract = plc.datetime.days_in_month(column.obj)
            # must subtract 1 to avoid rolling over to the previous month
            days_to_subtract = plc.binaryop.binary_operation(
                days_to_subtract,
                plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
                plc.binaryop.BinaryOperator.SUB,
                plc.DataType(plc.TypeId.DURATION_DAYS),
            )
            result = plc.binaryop.binary_operation(
                ends,
                days_to_subtract,
                plc.binaryop.BinaryOperator.SUB,
                self.dtype.plc,
            )

            return Column(result, dtype=self.dtype)
        if self.name is TemporalFunction.Name.MonthEnd:
            return Column(
                plc.unary.cast(
                    plc.datetime.last_day_of_month(column.obj), self.dtype.plc
                ),
                dtype=self.dtype,
            )
        if self.name is TemporalFunction.Name.IsLeapYear:
            return Column(
                plc.datetime.is_leap_year(column.obj),
                dtype=self.dtype,
            )
        if self.name is TemporalFunction.Name.OrdinalDay:
            return Column(plc.datetime.day_of_year(column.obj), dtype=self.dtype)
        if self.name is TemporalFunction.Name.Microsecond:
            millis = plc.datetime.extract_datetime_component(
                column.obj, plc.datetime.DatetimeComponent.MILLISECOND
            )
            micros = plc.datetime.extract_datetime_component(
                column.obj, plc.datetime.DatetimeComponent.MICROSECOND
            )
            millis_as_micros = plc.binaryop.binary_operation(
                millis,
                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                plc.binaryop.BinaryOperator.MUL,
                self.dtype.plc,
            )
            total_micros = plc.binaryop.binary_operation(
                micros,
                millis_as_micros,
                plc.binaryop.BinaryOperator.ADD,
                self.dtype.plc,
            )
            return Column(total_micros, dtype=self.dtype)
        elif self.name is TemporalFunction.Name.Nanosecond:
            millis = plc.datetime.extract_datetime_component(
                column.obj, plc.datetime.DatetimeComponent.MILLISECOND
            )
            micros = plc.datetime.extract_datetime_component(
                column.obj, plc.datetime.DatetimeComponent.MICROSECOND
            )
            nanos = plc.datetime.extract_datetime_component(
                column.obj, plc.datetime.DatetimeComponent.NANOSECOND
            )
            millis_as_nanos = plc.binaryop.binary_operation(
                millis,
                plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
                plc.binaryop.BinaryOperator.MUL,
                self.dtype.plc,
            )
            micros_as_nanos = plc.binaryop.binary_operation(
                micros,
                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                plc.binaryop.BinaryOperator.MUL,
                self.dtype.plc,
            )
            total_nanos = plc.binaryop.binary_operation(
                nanos,
                millis_as_nanos,
                plc.binaryop.BinaryOperator.ADD,
                self.dtype.plc,
            )
            total_nanos = plc.binaryop.binary_operation(
                total_nanos,
                micros_as_nanos,
                plc.binaryop.BinaryOperator.ADD,
                self.dtype.plc,
            )
            return Column(total_nanos, dtype=self.dtype)

        return Column(
            plc.datetime.extract_datetime_component(
                column.obj,
                self._COMPONENT_MAP[self.name],
            ),
            dtype=self.dtype,
        )
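Likewise, a sketch of the datetime accessors handled by TemporalFunction (hypothetical data; GPU engine assumed): plain components go through _COMPONENT_MAP to plc.datetime.extract_datetime_component, while month_start is derived from last_day_of_month minus (days_in_month - 1) as in the branch above.

import datetime
import polars as pl

lf = pl.LazyFrame({"ts": [datetime.datetime(2024, 2, 29, 12, 30, 45)]})
out = lf.select(
    pl.col("ts").dt.year(),          # Name.Year -> extract_datetime_component
    pl.col("ts").dt.is_leap_year(),  # Name.IsLeapYear -> plc.datetime.is_leap_year
    pl.col("ts").dt.month_start(),   # Name.MonthStart -> last_day_of_month arithmetic
).collect(engine="gpu")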
cudf_polars/dsl/expressions/literal.py
@@ -0,0 +1,97 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
# TODO: remove need for this
# ruff: noqa: D101
"""Literal DSL nodes."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, NoReturn

import polars as pl

import pylibcudf as plc

from cudf_polars.containers import Column, DataType
from cudf_polars.dsl.expressions.base import ExecutionContext, Expr

if TYPE_CHECKING:
    from collections.abc import Hashable

    from cudf_polars.containers import DataFrame

__all__ = ["Literal", "LiteralColumn"]


class Literal(Expr):
    __slots__ = ("value",)
    _non_child = ("dtype", "value")
    value: Any  # Python scalar

    def __init__(self, dtype: DataType, value: Any) -> None:
        if value is None and dtype.id() == plc.TypeId.EMPTY:
            # TypeId.EMPTY not supported by libcudf
            # cuDF Python also maps EMPTY to INT8
            dtype = DataType(pl.datatypes.Int8())
        self.dtype = dtype
        self.value = value
        self.children = ()
        self.is_pointwise = True

    def do_evaluate(
        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
        """Evaluate this expression given a dataframe for context."""
        return Column(
            plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype.plc), 1),
            dtype=self.dtype,
        )

    @property
    def agg_request(self) -> NoReturn:  # noqa: D102
        raise NotImplementedError(
            "Not expecting to require agg request of literal"
        )  # pragma: no cover

    def astype(self, dtype: DataType) -> Literal:
        """Cast self to dtype."""
        if self.value is None:
            return Literal(dtype, self.value)
        else:
            # Use polars to cast instead of pylibcudf
            # since these are just Python scalars
            casted = pl.Series(values=[self.value], dtype=self.dtype.polars).cast(
                dtype.polars
            )[0]
            return Literal(dtype, casted)


class LiteralColumn(Expr):
    __slots__ = ("value",)
    _non_child = ("dtype", "value")
    value: pl.Series

    def __init__(self, dtype: DataType, value: pl.Series) -> None:
        self.dtype = dtype
        self.value = value
        self.children = ()
        self.is_pointwise = True

    def get_hashable(self) -> Hashable:
        """Compute a hash of the column."""
        # This is stricter than necessary, but we only need this hash
        # for identity in groupby replacements so it's OK. And this
        # way we avoid doing potentially expensive compute.
        return (type(self), self.dtype.plc, id(self.value))

    def do_evaluate(
        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
        """Evaluate this expression given a dataframe for context."""
        return Column(plc.Column.from_arrow(self.value), dtype=self.dtype)

    @property
    def agg_request(self) -> NoReturn:  # noqa: D102
        raise NotImplementedError(
            "Not expecting to require agg request of literal"
        )  # pragma: no cover
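
Finally, a sketch of where these literal nodes originate (hypothetical data; GPU engine assumed): a scalar such as pl.lit(10) becomes a Literal, broadcast as a one-row column via plc.Column.from_scalar, while a pl.Series passed into an expression is the kind of value LiteralColumn wraps and converts with plc.Column.from_arrow.

import polars as pl

lf = pl.LazyFrame({"x": [1, 2, 3]})
out = lf.select(
    (pl.col("x") + pl.lit(10)).alias("shifted"),        # pl.lit -> Literal
    pl.col("x").is_in(pl.Series([1, 3])).alias("hit"),  # pl.Series -> LiteralColumn
).collect(engine="gpu")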