cudf-polars-cu13 25.10.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,164 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Base class for IR nodes, and utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar
+
+if TYPE_CHECKING:
+    from collections.abc import Generator, Hashable, Sequence
+
+    from typing_extensions import Self
+
+
+__all__: list[str] = ["Node"]
+
+T = TypeVar("T", bound="Node[Any]")
+
+
+class Node(Generic[T]):
+    """
+    An abstract node type.
+
+    Nodes are immutable!
+
+    This contains a (potentially empty) tuple of child nodes,
+    along with non-child data. For uniform reconstruction and
+    implementation of hashing and equality schemes, child classes need
+    to provide a certain amount of metadata when they are defined.
+    Specifically, the ``_non_child`` attribute must list, in-order,
+    the names of the slots that are passed to the constructor. The
+    constructor must take arguments in the order ``(*_non_child,
+    *children).``
+    """
+
+    __slots__ = ("_hash_value", "_repr_value", "children")
+    _hash_value: int
+    _repr_value: str
+    children: tuple[T, ...]
+    _non_child: ClassVar[tuple[str, ...]] = ()
+
+    def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def reconstruct(self, children: Sequence[T]) -> Self:
+        """
+        Rebuild this node with new children.
+
+        Parameters
+        ----------
+        children
+            New children
+
+        Returns
+        -------
+        New node with new children. Non-child data is shared with the input.
+        """
+        return type(self)(*self._ctor_arguments(children))
+
+    def __reduce__(self) -> tuple[Any, ...]:
+        """Pickle a Node object."""
+        return (
+            type(self),
+            self._ctor_arguments(self.children),
+        )
+
+    def get_hashable(self) -> Hashable:
+        """
+        Return a hashable object for the node.
+
+        Returns
+        -------
+        Hashable object.
+
+        Notes
+        -----
+        This method is used by the :meth:`__hash__` implementation
+        (which does caching). If your node type needs special-case
+        handling for some of its attributes, override this method, not
+        :meth:`__hash__`.
+        """
+        return (type(self), self._ctor_arguments(self.children))
+
+    def __hash__(self) -> int:
+        """
+        Hash of an expression with caching.
+
+        See Also
+        --------
+        get_hashable
+        """
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = hash(self.get_hashable())
+            return self._hash_value
+
+    def is_equal(self, other: Self) -> bool:
+        """
+        Equality of two nodes of equal type.
+
+        Override this in subclasses, rather than :meth:`__eq__`.
+
+        Parameters
+        ----------
+        other
+            object of same type to compare to.
+
+        Notes
+        -----
+        Since nodes are immutable, this does common subexpression
+        elimination when two nodes are determined to be equal.
+
+        :meth:`__eq__` handles the case where the objects being
+        compared are not of the same type, so in this method, we only
+        need to implement equality of equal types.
+
+        Returns
+        -------
+        True if the two nodes are equal, false otherwise.
+        """
+        if self is other:
+            return True
+        result = self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+        # Eager CSE for nodes that match.
+        if result:
+            self.children = other.children
+        return result
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Equality of expressions.
+
+        See Also
+        --------
+        is_equal
+        """
+        if type(self) is not type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other: Any) -> bool:
+        """Inequality of expressions."""
+        return not self.__eq__(other)
+
+    def __repr__(self) -> str:
+        """String representation of an expression with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
+
+    def __rich_repr__(self) -> Generator[Any, None, None]:
+        """Formatting for rich.pretty.pprint."""
+        for attr in self._non_child:
+            yield attr, getattr(self, attr)
+
+        yield from self.children
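
The nodebase.py hunk above defines the (*_non_child, *children) constructor contract that concrete node types follow, together with cached hashing, equality and reconstruct. A minimal sketch of a subclass exercising that contract, assuming cudf-polars is installed; the "Named" node type is hypothetical, not something shipped in this wheel:

# A minimal sketch, assuming cudf-polars is installed; "Named" is a
# hypothetical node type, not one shipped in this wheel.
from __future__ import annotations

from typing import ClassVar

from cudf_polars.dsl.nodebase import Node


class Named(Node["Named"]):
    """Toy node with one non-child slot ("name") and arbitrary children."""

    __slots__ = ("name",)
    _non_child: ClassVar[tuple[str, ...]] = ("name",)

    def __init__(self, name: str, *children: Named) -> None:
        # Base-class contract: arguments arrive as (*_non_child, *children).
        self.name = name
        self.children = tuple(children)


a, b = Named("a"), Named("b")
root = Named("root", a, b)

# reconstruct shares non-child data ("root") and swaps in new children.
flipped = root.reconstruct([b, a])
assert flipped == Named("root", b, a)

# Hashing/equality come from Node via _ctor_arguments, with caching.
assert hash(root) == hash(Named("root", a, b))
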
@@ -0,0 +1,359 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Conversion of expression nodes to libcudf AST nodes."""
+
+from __future__ import annotations
+
+from functools import partial, reduce, singledispatch
+from typing import TYPE_CHECKING, TypeAlias, TypedDict
+
+import pylibcudf as plc
+from pylibcudf import expressions as plc_expr
+
+from cudf_polars.containers import DataType
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged
+from cudf_polars.typing import GenericTransformer
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+
+# Can't merge these op-mapping dictionaries because scoped enum values
+# are exposed by cython with equality/hash based one their underlying
+# representation type. So in a dict they are just treated as integers.
+BINOP_TO_ASTOP = {
+    plc.binaryop.BinaryOperator.EQUAL: plc_expr.ASTOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NULL_EQUALS: plc_expr.ASTOperator.NULL_EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL: plc_expr.ASTOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS: plc_expr.ASTOperator.LESS,
+    plc.binaryop.BinaryOperator.LESS_EQUAL: plc_expr.ASTOperator.LESS_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER: plc_expr.ASTOperator.GREATER,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL: plc_expr.ASTOperator.GREATER_EQUAL,
+    plc.binaryop.BinaryOperator.ADD: plc_expr.ASTOperator.ADD,
+    plc.binaryop.BinaryOperator.SUB: plc_expr.ASTOperator.SUB,
+    plc.binaryop.BinaryOperator.MUL: plc_expr.ASTOperator.MUL,
+    plc.binaryop.BinaryOperator.DIV: plc_expr.ASTOperator.DIV,
+    plc.binaryop.BinaryOperator.TRUE_DIV: plc_expr.ASTOperator.TRUE_DIV,
+    plc.binaryop.BinaryOperator.FLOOR_DIV: plc_expr.ASTOperator.FLOOR_DIV,
+    plc.binaryop.BinaryOperator.PYMOD: plc_expr.ASTOperator.PYMOD,
+    plc.binaryop.BinaryOperator.BITWISE_AND: plc_expr.ASTOperator.BITWISE_AND,
+    plc.binaryop.BinaryOperator.BITWISE_OR: plc_expr.ASTOperator.BITWISE_OR,
+    plc.binaryop.BinaryOperator.BITWISE_XOR: plc_expr.ASTOperator.BITWISE_XOR,
+    plc.binaryop.BinaryOperator.LOGICAL_AND: plc_expr.ASTOperator.LOGICAL_AND,
+    plc.binaryop.BinaryOperator.LOGICAL_OR: plc_expr.ASTOperator.LOGICAL_OR,
+    plc.binaryop.BinaryOperator.NULL_LOGICAL_AND: plc_expr.ASTOperator.NULL_LOGICAL_AND,
+    plc.binaryop.BinaryOperator.NULL_LOGICAL_OR: plc_expr.ASTOperator.NULL_LOGICAL_OR,
+}
+
+UOP_TO_ASTOP = {
+    plc.unary.UnaryOperator.SIN: plc_expr.ASTOperator.SIN,
+    plc.unary.UnaryOperator.COS: plc_expr.ASTOperator.COS,
+    plc.unary.UnaryOperator.TAN: plc_expr.ASTOperator.TAN,
+    plc.unary.UnaryOperator.ARCSIN: plc_expr.ASTOperator.ARCSIN,
+    plc.unary.UnaryOperator.ARCCOS: plc_expr.ASTOperator.ARCCOS,
+    plc.unary.UnaryOperator.ARCTAN: plc_expr.ASTOperator.ARCTAN,
+    plc.unary.UnaryOperator.SINH: plc_expr.ASTOperator.SINH,
+    plc.unary.UnaryOperator.COSH: plc_expr.ASTOperator.COSH,
+    plc.unary.UnaryOperator.TANH: plc_expr.ASTOperator.TANH,
+    plc.unary.UnaryOperator.ARCSINH: plc_expr.ASTOperator.ARCSINH,
+    plc.unary.UnaryOperator.ARCCOSH: plc_expr.ASTOperator.ARCCOSH,
+    plc.unary.UnaryOperator.ARCTANH: plc_expr.ASTOperator.ARCTANH,
+    plc.unary.UnaryOperator.EXP: plc_expr.ASTOperator.EXP,
+    plc.unary.UnaryOperator.LOG: plc_expr.ASTOperator.LOG,
+    plc.unary.UnaryOperator.SQRT: plc_expr.ASTOperator.SQRT,
+    plc.unary.UnaryOperator.CBRT: plc_expr.ASTOperator.CBRT,
+    plc.unary.UnaryOperator.CEIL: plc_expr.ASTOperator.CEIL,
+    plc.unary.UnaryOperator.FLOOR: plc_expr.ASTOperator.FLOOR,
+    plc.unary.UnaryOperator.ABS: plc_expr.ASTOperator.ABS,
+    plc.unary.UnaryOperator.RINT: plc_expr.ASTOperator.RINT,
+    plc.unary.UnaryOperator.BIT_INVERT: plc_expr.ASTOperator.BIT_INVERT,
+    plc.unary.UnaryOperator.NOT: plc_expr.ASTOperator.NOT,
+}
+
+SUPPORTED_STATISTICS_BINOPS = {
+    plc.binaryop.BinaryOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS,
+    plc.binaryop.BinaryOperator.LESS_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL,
+}
+
+REVERSED_COMPARISON = {
+    plc.binaryop.BinaryOperator.EQUAL: plc.binaryop.BinaryOperator.EQUAL,
+    plc.binaryop.BinaryOperator.NOT_EQUAL: plc.binaryop.BinaryOperator.NOT_EQUAL,
+    plc.binaryop.BinaryOperator.LESS: plc.binaryop.BinaryOperator.GREATER,
+    plc.binaryop.BinaryOperator.LESS_EQUAL: plc.binaryop.BinaryOperator.GREATER_EQUAL,
+    plc.binaryop.BinaryOperator.GREATER: plc.binaryop.BinaryOperator.LESS,
+    plc.binaryop.BinaryOperator.GREATER_EQUAL: plc.binaryop.BinaryOperator.LESS_EQUAL,
+}
+
+
+class ASTState(TypedDict):
+    """
+    State for AST transformations.
+
+    Parameters
+    ----------
+    for_parquet
+        Indicator for whether this transformation should provide an expression
+        suitable for use in parquet filters.
+    """
+
+    for_parquet: bool
+
+
+class ExprTransformerState(TypedDict):
+    """
+    State used for AST transformation when inserting column references.
+
+    Parameters
+    ----------
+    name_to_index
+        Mapping from column names to column indices in the table
+        eventually used for evaluation.
+    table_ref
+        pylibcudf `TableReference` indicating whether column
+        references are coming from the left or right table.
+    """
+
+    name_to_index: Mapping[str, int]
+    table_ref: plc.expressions.TableReference
+
+
+Transformer: TypeAlias = GenericTransformer[expr.Expr, plc_expr.Expression, ASTState]
+ExprTransformer: TypeAlias = GenericTransformer[
+    expr.Expr, expr.Expr, ExprTransformerState
+]
+"""Protocol for transformation of Expr nodes."""
+
+
+@singledispatch
+def _to_ast(node: expr.Expr, self: Transformer) -> plc_expr.Expression:
+    """
+    Translate an expression to a pylibcudf Expression.
+
+    Parameters
+    ----------
+    node
+        Expression to translate.
+    self
+        Recursive transformer. The state dictionary is an instance of
+        :class:`ASTState`.
+
+    Returns
+    -------
+    pylibcudf Expression.
+
+    Raises
+    ------
+    NotImplementedError or KeyError if the expression cannot be translated.
+    """
+    raise NotImplementedError(f"Unhandled expression type {type(node)}")
+
+
+@_to_ast.register
+def _(node: expr.Col, self: Transformer) -> plc_expr.Expression:
+    if self.state["for_parquet"]:
+        return plc_expr.ColumnNameReference(node.name)
+    raise TypeError("Should always be wrapped in a ColRef node before translation")
+
+
+@_to_ast.register
+def _(node: expr.ColRef, self: Transformer) -> plc_expr.Expression:
+    if self.state["for_parquet"]:
+        raise TypeError("Not expecting ColRef node in parquet filter")
+    return plc_expr.ColumnReference(node.index, node.table_ref)
+
+
+@_to_ast.register
+def _(node: expr.Literal, self: Transformer) -> plc_expr.Expression:
+    return plc_expr.Literal(plc.Scalar.from_py(node.value, node.dtype.plc))
+
+
+@_to_ast.register
+def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression:
+    if node.op == plc.binaryop.BinaryOperator.NULL_NOT_EQUALS:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            self(
+                # Reconstruct and apply, rather than directly
+                # constructing the right expression so we get the
+                # handling of parquet special cases for free.
+                expr.BinOp(
+                    node.dtype, plc.binaryop.BinaryOperator.NULL_EQUALS, *node.children
+                )
+            ),
+        )
+    if self.state["for_parquet"]:
+        op1_col, op2_col = (isinstance(op, expr.Col) for op in node.children)
+        if op1_col ^ op2_col:
+            op = node.op
+            if op not in SUPPORTED_STATISTICS_BINOPS:
+                raise NotImplementedError(
+                    f"Parquet filter binop with column doesn't support {node.op!r}"
+                )
+            op1, op2 = node.children
+            if op2_col:
+                (op1, op2) = (op2, op1)
+                op = REVERSED_COMPARISON[op]
+            if not isinstance(op2, expr.Literal):
+                raise NotImplementedError(
+                    "Parquet filter binops must have form 'col binop literal'"
+                )
+            return plc_expr.Operation(BINOP_TO_ASTOP[op], self(op1), self(op2))
+        elif op1_col and op2_col:
+            raise NotImplementedError(
+                "Parquet filter binops must have one column reference not two"
+            )
+    return plc_expr.Operation(BINOP_TO_ASTOP[node.op], *map(self, node.children))
+
+
+@_to_ast.register
+def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
+    if node.name is expr.BooleanFunction.Name.IsIn:
+        needles, haystack = node.children
+        if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16:
+            # 16 is an arbitrary limit
+            needle_ref = self(needles)
+            if haystack.dtype.id() == plc.TypeId.LIST:
+                # Because we originally translated pl_expr.Literal with a list scalar
+                # to a expr.LiteralColumn, so the actual type is in the inner type
+                #
+                # the type-ignore is safe because the for plc.TypeID.LIST, we know
+                # we have a polars.List type, which has an inner attribute.
+                plc_dtype = DataType(haystack.dtype.polars.inner).plc  # type: ignore[attr-defined]
+            else:
+                plc_dtype = haystack.dtype.plc  # pragma: no cover
+            values = (
+                plc_expr.Literal(plc.Scalar.from_py(val, plc_dtype))
+                for val in haystack.value
+            )
+            return reduce(
+                partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR),
+                (
+                    plc_expr.Operation(plc_expr.ASTOperator.EQUAL, needle_ref, value)
+                    for value in values
+                ),
+            )
+    if self.state["for_parquet"] and isinstance(node.children[0], expr.Col):
+        raise NotImplementedError(
+            f"Parquet filters don't support {node.name} on columns"
+        )
+    if node.name is expr.BooleanFunction.Name.IsNull:
+        return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0]))
+    elif node.name is expr.BooleanFunction.Name.IsNotNull:
+        return plc_expr.Operation(
+            plc_expr.ASTOperator.NOT,
+            plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])),
+        )
+    elif node.name is expr.BooleanFunction.Name.Not:
+        return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0]))
+    raise NotImplementedError(f"AST conversion does not support {node.name}")
+
+
+@_to_ast.register
+def _(node: expr.UnaryFunction, self: Transformer) -> plc_expr.Expression:
+    if isinstance(node.children[0], expr.Col) and self.state["for_parquet"]:
+        raise NotImplementedError(
+            "Parquet filters don't support {node.name} on columns"
+        )
+    return plc_expr.Operation(
+        UOP_TO_ASTOP[node._OP_MAPPING[node.name]], self(node.children[0])
+    )
+
+
+def to_parquet_filter(node: expr.Expr) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for parquet filtering.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper: Transformer = CachingVisitor(_to_ast, state={"for_parquet": True})
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
+
+
+def to_ast(node: expr.Expr) -> plc_expr.Expression | None:
+    """
+    Convert an expression to libcudf AST nodes suitable for compute_column.
+
+    Parameters
+    ----------
+    node
+        Expression to convert.
+
+    Notes
+    -----
+    `Col` nodes must always be wrapped in `TableRef` nodes when
+    converting to an ast expression so that their table reference and
+    index are provided.
+
+    Returns
+    -------
+    pylibcudf Expression if conversion is possible, otherwise None.
+    """
+    mapper: Transformer = CachingVisitor(_to_ast, state={"for_parquet": False})
+    try:
+        return mapper(node)
+    except (KeyError, NotImplementedError):
+        return None
+
+
+def _insert_colrefs(node: expr.Expr, rec: ExprTransformer) -> expr.Expr:
+    if isinstance(node, expr.Col):
+        return expr.ColRef(
+            node.dtype,
+            rec.state["name_to_index"][node.name],
+            rec.state["table_ref"],
+            node,
+        )
+    return reuse_if_unchanged(node, rec)
+
+
+def insert_colrefs(
+    node: expr.Expr,
+    *,
+    table_ref: plc.expressions.TableReference,
+    name_to_index: Mapping[str, int],
+) -> expr.Expr:
+    """
+    Insert column references into an expression before conversion to libcudf AST.
+
+    Parameters
+    ----------
+    node
+        Expression to insert references into.
+    table_ref
+        pylibcudf `TableReference` indicating whether column
+        references are coming from the left or right table.
+    name_to_index:
+        Mapping from column names to column indices in the table
+        eventually used for evaluation.
+
+    Notes
+    -----
+    All column references are wrapped in the same, singular, table
+    reference, so this function relies on the expression only
+    containing column references from a single table.
+
+    Returns
+    -------
+    New expression with column references inserted.
+    """
+    mapper: ExprTransformer = CachingVisitor(
+        _insert_colrefs,
+        state={"name_to_index": name_to_index, "table_ref": table_ref},
+    )
+    return mapper(node)
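
The to_ast.py hunk above lowers cudf-polars expression nodes either to column-name references (for parquet filter pushdown via to_parquet_filter) or to index-based column references (for compute_column-style evaluation, after insert_colrefs). A minimal sketch of that flow, assuming polars, pylibcudf and cudf-polars are installed; the Col/Literal/BinOp argument orders are assumed from their (*_non_child, *children) contract, and the column name "a" and index mapping are illustrative:

# A minimal sketch, assuming polars, pylibcudf and cudf-polars are installed;
# the column name "a" and the name_to_index mapping are illustrative only.
import polars as pl
import pylibcudf as plc

from cudf_polars.containers import DataType
from cudf_polars.dsl import expr
from cudf_polars.dsl.to_ast import insert_colrefs, to_ast, to_parquet_filter

i64 = DataType(pl.Int64())
boolean = DataType(pl.Boolean())

# Predicate: col("a") < 10 (constructor orders assumed: (*_non_child, *children)).
predicate = expr.BinOp(
    boolean,
    plc.binaryop.BinaryOperator.LESS,
    expr.Col(i64, "a"),
    expr.Literal(i64, 10),
)

# Parquet filters keep Col nodes as column-name references; unsupported
# expressions simply yield None rather than raising.
parquet_filter = to_parquet_filter(predicate)

# For compute_column-style evaluation, Col nodes must first be rewritten to
# ColRef nodes carrying a table reference and column index.
ast = to_ast(
    insert_colrefs(
        predicate,
        table_ref=plc.expressions.TableReference.LEFT,
        name_to_index={"a": 0},
    )
)
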
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for tracing and monitoring IR execution."""
+
+from __future__ import annotations
+
+import functools
+
+import nvtx
+
+CUDF_POLARS_NVTX_DOMAIN = "cudf_polars"
+
+nvtx_annotate_cudf_polars = functools.partial(
+    nvtx.annotate, domain=CUDF_POLARS_NVTX_DOMAIN
+)
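
The tracing.py hunk above simply pins the NVTX domain used for all annotations. A minimal usage sketch; the decorated function name is hypothetical:

# A minimal sketch; "evaluate_ir" is a hypothetical function name.
from cudf_polars.dsl.tracing import nvtx_annotate_cudf_polars


@nvtx_annotate_cudf_polars(message="evaluate-ir")
def evaluate_ir() -> None:
    # Work done here shows up as an NVTX range in the "cudf_polars"
    # domain when profiling, e.g. with Nsight Systems.
    ...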