cudf-polars-cu12 24.8.0a281__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cudf_polars/VERSION ADDED
@@ -0,0 +1 @@
1
+ 24.08.00a281
@@ -0,0 +1,22 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ An executor for polars logical plans.
6
+
7
+ This package implements an executor for polars logical plans using
8
+ pylibcudf to execute the plans on device.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from cudf_polars._version import __git_commit__, __version__
14
+ from cudf_polars.callback import execute_with_cudf
15
+ from cudf_polars.dsl.translate import translate_ir
16
+
17
# Public names exported by this package: the two entry points imported
# above plus the version metadata.
__all__: list[str] = [
    "execute_with_cudf",
    "translate_ir",
    "__git_commit__",
    "__version__",
]
@@ -0,0 +1,21 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import importlib.resources
7
+
8
# Version string, read from the ``VERSION`` resource file inside this
# package and stripped of surrounding whitespace.
__version__ = (
    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
)
try:
    # Commit identifier, read from the ``GIT_COMMIT`` resource file if
    # the package contains one.
    __git_commit__ = (
        importlib.resources.files(__package__)
        .joinpath("GIT_COMMIT")
        .read_text()
        .strip()
    )
except FileNotFoundError:
    # No GIT_COMMIT resource available: fall back to an empty string.
    __git_commit__ = ""

# Public names exported by this module.
__all__ = ["__git_commit__", "__version__"]
@@ -0,0 +1,66 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Callback for the polars collect function to execute on device."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from functools import partial
9
+ from typing import TYPE_CHECKING
10
+
11
+ import nvtx
12
+
13
+ from cudf_polars.dsl.translate import translate_ir
14
+
15
+ if TYPE_CHECKING:
16
+ import polars as pl
17
+
18
+ from cudf_polars.dsl.ir import IR
19
+ from cudf_polars.typing import NodeTraverser
20
+
21
# Public names exported by this module.
__all__: list[str] = ["execute_with_cudf"]
22
+
23
+
24
+ def _callback(
25
+ ir: IR,
26
+ with_columns: list[str] | None,
27
+ pyarrow_predicate: str | None,
28
+ n_rows: int | None,
29
+ ) -> pl.DataFrame:
30
+ assert with_columns is None
31
+ assert pyarrow_predicate is None
32
+ assert n_rows is None
33
+ with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
34
+ return ir.evaluate(cache={}).to_polars()
35
+
36
+
37
def execute_with_cudf(
    nt: NodeTraverser,
    *,
    raise_on_fail: bool = False,
    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
) -> None:
    """
    Post-optimization callback that tries to run the plan with cudf.

    The plan held by ``nt`` is translated and, on success, a callback
    that evaluates the translated plan is installed on ``nt`` via
    ``set_udf``. If ``exception`` is raised and ``raise_on_fail`` is
    false, it is swallowed.

    Parameters
    ----------
    nt
        NodeTraverser

    raise_on_fail
        Should conversion raise an exception rather than continuing
        without setting a callback.

    exception
        Optional exception, or tuple of exceptions, to catch during
        translation. Defaults to ``Exception``.

    The NodeTraverser is mutated if the libcudf executor can handle the plan.
    """
    try:
        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
            install = nt.set_udf
            install(partial(_callback, translate_ir(nt)))
    except exception:
        if not raise_on_fail:
            # Best effort: continue without setting a callback.
            return
        raise
@@ -0,0 +1,11 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Containers of concrete data."""
5
+
6
+ from __future__ import annotations
7
+
8
# Public names exported by this package; they are imported just below.
__all__: list[str] = ["DataFrame", "Column", "NamedColumn"]
9
+
10
+ from cudf_polars.containers.column import Column, NamedColumn
11
+ from cudf_polars.containers.dataframe import DataFrame
@@ -0,0 +1,189 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """A column, with some properties."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import functools
9
+ from typing import TYPE_CHECKING
10
+
11
+ import cudf._lib.pylibcudf as plc
12
+
13
+ if TYPE_CHECKING:
14
+ from typing_extensions import Self
15
+
16
# Public names exported by this module.
__all__: list[str] = ["Column", "NamedColumn"]
17
+
18
+
19
class Column:
    """
    An immutable column with sortedness metadata.

    The wrapped pylibcudf column is never modified by this class; the
    sortedness metadata can be updated in place through ``set_sorted``
    and ``sorted_like``.

    Attributes
    ----------
    obj
        The wrapped pylibcudf column.
    is_sorted
        Whether the column is sorted.
    order
        The order, if sorted.
    null_order
        Where nulls sort, if sorted.
    is_scalar
        True when the column has exactly one row.
    """

    obj: plc.Column
    is_sorted: plc.types.Sorted
    order: plc.types.Order
    null_order: plc.types.NullOrder
    is_scalar: bool

    def __init__(
        self,
        column: plc.Column,
        *,
        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
        order: plc.types.Order = plc.types.Order.ASCENDING,
        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
    ):
        """
        Wrap a pylibcudf column.

        Parameters
        ----------
        column
            The pylibcudf column to wrap.
        is_sorted
            Is the column sorted
        order
            The order if sorted
        null_order
            Where nulls sort, if sorted
        """
        self.obj = column
        self.is_scalar = self.obj.size() == 1
        # A column with at most one row is always treated as sorted,
        # whatever the caller passed.
        if self.obj.size() <= 1:
            is_sorted = plc.types.Sorted.YES
        self.is_sorted = is_sorted
        self.order = order
        self.null_order = null_order

    @functools.cached_property
    def obj_scalar(self) -> plc.Scalar:
        """
        A copy of the column object as a pylibcudf Scalar.

        Returns
        -------
        pylibcudf Scalar object.

        Raises
        ------
        ValueError
            If the column is not length-1.
        """
        if not self.is_scalar:
            raise ValueError(
                f"Cannot convert a column of length {self.obj.size()} to scalar"
            )
        return plc.copying.get_element(self.obj, 0)

    def sorted_like(self, like: Column, /) -> Self:
        """
        Copy sortedness properties from a column onto self.

        Parameters
        ----------
        like
            The column to copy sortedness metadata from.

        Returns
        -------
        Self with metadata set.

        See Also
        --------
        set_sorted
        """
        return self.set_sorted(
            is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
        )

    def set_sorted(
        self,
        *,
        is_sorted: plc.types.Sorted,
        order: plc.types.Order,
        null_order: plc.types.NullOrder,
    ) -> Self:
        """
        Modify sortedness metadata in place.

        Parameters
        ----------
        is_sorted
            Is the column sorted
        order
            The order if sorted
        null_order
            Where nulls sort, if sorted

        Returns
        -------
        Self with metadata set.
        """
        # Same rule as the constructor: at most one row means sorted.
        if self.obj.size() <= 1:
            is_sorted = plc.types.Sorted.YES
        self.is_sorted = is_sorted
        self.order = order
        self.null_order = null_order
        return self

    def copy(self) -> Self:
        """
        A shallow copy of the column.

        Returns
        -------
        New column sharing data with self.
        """
        return type(self)(
            self.obj,
            is_sorted=self.is_sorted,
            order=self.order,
            null_order=self.null_order,
        )

    def mask_nans(self) -> Self:
        """
        Return a copy of self with nans masked out.

        Raises
        ------
        NotImplementedError
            If the column contains any NaN values.
        """
        if self.nan_count > 0:
            raise NotImplementedError("Need to port transform.hpp to pylibcudf")
        return self.copy()

    @functools.cached_property
    def nan_count(self) -> int:
        """
        Return the number of NaN values in the column.

        Columns that are empty, or whose type is not FLOAT32/FLOAT64,
        report zero without launching a reduction.
        """
        # An empty column has no NaNs. Short-circuit here: a sum
        # reduction over zero rows yields a null scalar, which would
        # surface as ``None`` rather than an integer count.
        if self.obj.size() == 0:
            return 0
        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
            return 0
        return plc.interop.to_arrow(
            plc.reduce.reduce(
                plc.unary.is_nan(self.obj),
                plc.aggregation.sum(),
                # TODO: pylibcudf needs to have a SizeType DataType singleton
                plc.DataType(plc.TypeId.INT32),
            )
        ).as_py()
149
+
150
+
151
class NamedColumn(Column):
    """A column that additionally carries a name."""

    name: str

    def __init__(
        self,
        column: plc.Column,
        name: str,
        *,
        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
        order: plc.types.Order = plc.types.Order.ASCENDING,
        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
    ) -> None:
        """
        Wrap a pylibcudf column and attach a name to it.

        Parameters
        ----------
        column
            The pylibcudf column to wrap.
        name
            Name of the column.
        is_sorted
            Is the column sorted
        order
            The order if sorted
        null_order
            Where nulls sort, if sorted
        """
        sortedness = {
            "is_sorted": is_sorted,
            "order": order,
            "null_order": null_order,
        }
        super().__init__(column, **sortedness)
        self.name = name

    def copy(self, *, new_name: str | None = None) -> Self:
        """
        A shallow copy of the column.

        Parameters
        ----------
        new_name
            Optional new name for the copied column.

        Returns
        -------
        New column sharing data with self.
        """
        # Keep the current name unless a replacement was supplied.
        name = new_name
        if name is None:
            name = self.name
        return type(self)(
            self.obj,
            name,
            is_sorted=self.is_sorted,
            order=self.order,
            null_order=self.null_order,
        )
@@ -0,0 +1,226 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """A dataframe, with some properties."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import itertools
9
+ from functools import cached_property
10
+ from typing import TYPE_CHECKING, cast
11
+
12
+ import polars as pl
13
+
14
+ import cudf._lib.pylibcudf as plc
15
+
16
+ from cudf_polars.containers.column import NamedColumn
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Mapping, Sequence, Set
20
+
21
+ import pyarrow as pa
22
+ from typing_extensions import Self
23
+
24
+ import cudf
25
+
26
+ from cudf_polars.containers import Column
27
+
28
+
29
# Public names exported by this module.
__all__: list[str] = ["DataFrame"]
30
+
31
+
32
class DataFrame:
    """
    A representation of a dataframe.

    Holds an ordered list of named columns together with the pylibcudf
    table built from the underlying column objects.
    """

    columns: list[NamedColumn]
    table: plc.Table

    def __init__(self, columns: Sequence[NamedColumn]) -> None:
        """
        Build a dataframe from named columns.

        Parameters
        ----------
        columns
            The columns of the frame, in order.
        """
        # Materialise the argument once and derive everything else from
        # that list, so that a one-shot iterable is not consumed twice
        # (which would leave the table empty).
        self.columns = list(columns)
        self._column_map = {c.name: c for c in self.columns}
        self.table = plc.Table([c.obj for c in self.columns])

    def copy(self) -> Self:
        """Return a shallow copy of self."""
        return type(self)([c.copy() for c in self.columns])

    def to_polars(self) -> pl.DataFrame:
        """Convert to a polars DataFrame."""
        table: pa.Table = plc.interop.to_arrow(
            self.table,
            [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
        )

        return cast(pl.DataFrame, pl.from_arrow(table))

    @cached_property
    def column_names_set(self) -> frozenset[str]:
        """Return the column names as a set."""
        return frozenset(c.name for c in self.columns)

    @cached_property
    def column_names(self) -> list[str]:
        """Return a list of the column names."""
        return [c.name for c in self.columns]

    @cached_property
    def num_columns(self) -> int:
        """Number of columns."""
        return len(self.columns)

    @cached_property
    def num_rows(self) -> int:
        """Number of rows (zero for a frame without columns)."""
        # The table is not consulted when there are no columns.
        return self.table.num_rows() if self.columns else 0

    @classmethod
    def from_cudf(cls, df: cudf.DataFrame) -> Self:
        """Create from a cudf dataframe."""
        return cls(
            [
                NamedColumn(c.to_pylibcudf(mode="read"), name)
                for name, c in df._data.items()
            ]
        )

    @classmethod
    def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
        """
        Create from a pylibcudf table.

        Parameters
        ----------
        table
            Pylibcudf table to obtain columns from
        names
            Names for the columns

        Returns
        -------
        New dataframe sharing data with the input table.

        Raises
        ------
        ValueError
            If the number of provided names does not match the
            number of columns in the table.
        """
        if table.num_columns() != len(names):
            raise ValueError("Mismatching name and table length.")
        return cls(
            # TODO: strict=True when we drop py39
            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
        )

    def sorted_like(
        self, like: DataFrame, /, *, subset: Set[str] | None = None
    ) -> Self:
        """
        Copy sortedness from a dataframe onto self.

        Parameters
        ----------
        like
            The dataframe to copy from
        subset
            Optional subset of columns from which to copy data.

        Returns
        -------
        Self with metadata set.

        Raises
        ------
        ValueError
            If there is a name mismatch between self and like.
        """
        if like.column_names != self.column_names:
            raise ValueError("Can only copy from identically named frame")
        subset = self.column_names_set if subset is None else subset
        self.columns = [
            c.sorted_like(other) if c.name in subset else c
            # TODO: strict=True when we drop py39
            for c, other in zip(self.columns, like.columns)
        ]
        return self

    def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
        """
        Return a new dataframe with extra columns.

        Parameters
        ----------
        columns
            Columns to add

        Returns
        -------
        New dataframe

        Notes
        -----
        If column names overlap, newer names replace older ones.
        """
        # Later entries overwrite earlier ones with the same name while
        # keeping the position of the first occurrence.
        merged = {c.name: c for c in itertools.chain(self.columns, columns)}
        return type(self)(list(merged.values()))

    def discard_columns(self, names: Set[str]) -> Self:
        """Drop columns by name."""
        return type(self)([c for c in self.columns if c.name not in names])

    def select(self, names: Sequence[str]) -> Self:
        """
        Select columns by name returning DataFrame.

        Raises
        ------
        ValueError
            If any requested name is not a column of this frame.
        """
        want = set(names)
        if not want.issubset(self.column_names_set):
            raise ValueError("Can't select missing names")
        return type(self)([self._column_map[name] for name in names])

    def replace_columns(self, *columns: NamedColumn) -> Self:
        """
        Return a new dataframe with columns replaced by name.

        Raises
        ------
        ValueError
            If any replacement column has a name not present in this frame.
        """
        new = {c.name: c for c in columns}
        if not set(new).issubset(self.column_names_set):
            raise ValueError("Cannot replace with non-existing names")
        return type(self)([new.get(c.name, c) for c in self.columns])

    def rename_columns(self, mapping: Mapping[str, str]) -> Self:
        """Rename some columns; names absent from the mapping are kept."""
        return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])

    def select_columns(self, names: Set[str]) -> list[NamedColumn]:
        """Select columns by name."""
        return [c for c in self.columns if c.name in names]

    def filter(self, mask: Column) -> Self:
        """Return a filtered table given a mask."""
        table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
        return type(self).from_table(table, self.column_names).sorted_like(self)

    def slice(self, zlice: tuple[int, int] | None) -> Self:
        """
        Slice a dataframe.

        Parameters
        ----------
        zlice
            optional, tuple of start and length, negative values of start
            treated as for python indexing. If not provided, returns self.

        Returns
        -------
        New dataframe (if zlice is not None) otherwise self (if it is)
        """
        if zlice is None:
            return self
        start, length = zlice
        if start < 0:
            start += self.num_rows
        # Polars implementation wraps negative start by num_rows, then
        # adds length to start to get the end, then clamps both to
        # [0, num_rows]
        end = start + length
        start = max(min(start, self.num_rows), 0)
        end = max(min(end, self.num_rows), 0)
        (table,) = plc.copying.slice(self.table, [start, end])
        return type(self).from_table(table, self.column_names).sorted_like(self)
@@ -0,0 +1,8 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """The domain-specific language (DSL) for the polars executor."""
5
+
6
+ from __future__ import annotations
7
+
8
# This package exports no public names.
__all__: list[str] = []