cudf-polars-cu13 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""A dataframe, with some properties."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from functools import cached_property
|
|
9
|
+
from typing import TYPE_CHECKING, cast
|
|
10
|
+
|
|
11
|
+
import polars as pl
|
|
12
|
+
|
|
13
|
+
import pylibcudf as plc
|
|
14
|
+
|
|
15
|
+
from cudf_polars.containers import Column, DataType
|
|
16
|
+
from cudf_polars.utils import conversion
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from collections.abc import Iterable, Mapping, Sequence, Set
|
|
20
|
+
|
|
21
|
+
from typing_extensions import Any, CapsuleType, Self
|
|
22
|
+
|
|
23
|
+
from cudf_polars.typing import ColumnOptions, DataFrameHeader, PolarsDataType, Slice
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
__all__: list[str] = ["DataFrame"]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _create_polars_column_metadata(
    name: str, dtype: PolarsDataType
) -> plc.interop.ColumnMetadata:
    """Create ColumnMetadata preserving dtype attributes not supported by libcudf."""
    # Defaults: no nested children, naive timestamp, unspecified precision.
    nested: list[plc.interop.ColumnMetadata] = []
    tz = ""
    prec: int | None = None

    if isinstance(dtype, pl.Struct):
        # Recurse so struct field names (and their own metadata) survive
        # the round trip through libcudf.
        nested = [
            _create_polars_column_metadata(f.name, f.dtype) for f in dtype.fields
        ]
    elif isinstance(dtype, pl.Datetime):
        if dtype.time_zone is not None:
            tz = dtype.time_zone
    elif isinstance(dtype, pl.Decimal):
        prec = dtype.precision

    return plc.interop.ColumnMetadata(
        name=name, timezone=tz, precision=prec, children_meta=nested
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# This is also defined in pylibcudf.interop
|
|
56
|
+
class _ObjectWithArrowMetadata:
|
|
57
|
+
def __init__(
|
|
58
|
+
self, obj: plc.Table | plc.Column, metadata: list[plc.interop.ColumnMetadata]
|
|
59
|
+
) -> None:
|
|
60
|
+
self.obj = obj
|
|
61
|
+
self.metadata = metadata
|
|
62
|
+
|
|
63
|
+
def __arrow_c_array__(
|
|
64
|
+
self, requested_schema: None = None
|
|
65
|
+
) -> tuple[CapsuleType, CapsuleType]:
|
|
66
|
+
return self.obj._to_schema(self.metadata), self.obj._to_host_array()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Pacify the type checker. DataFrame init asserts that all the columns
# have a string name, so let's narrow the type.
class NamedColumn(Column):
    # Narrowed from the base class's optional name: DataFrame.__init__
    # raises on any unnamed column, so a str name is always present here.
    name: str
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class DataFrame:
    """
    A representation of a dataframe.

    Wraps a ``pylibcudf.Table`` together with named, typed columns, keeping
    polars-level metadata (dtypes, sortedness) that libcudf does not track.
    """

    # Name -> column mapping; insertion order defines column order.
    column_map: dict[str, Column]
    # Device table sharing buffers with ``columns``.
    table: plc.Table
    # Columns in order, narrowed to NamedColumn (names validated in __init__).
    columns: list[NamedColumn]
    # DataType of each column, in column order.
    dtypes: list[DataType]

    def __init__(self, columns: Iterable[Column]) -> None:
        columns = list(columns)
        if any(c.name is None for c in columns):
            raise ValueError("All columns must have a name")
        # Safe narrowing: the check above guarantees every name is a str.
        self.columns = [cast(NamedColumn, c) for c in columns]
        self.dtypes = [c.dtype for c in self.columns]
        self.column_map = {c.name: c for c in self.columns}
        self.table = plc.Table([c.obj for c in self.columns])

    def copy(self) -> Self:
        """Return a shallow copy of self."""
        return type(self)(c.copy() for c in self.columns)

    def to_polars(self) -> pl.DataFrame:
        """Convert to a polars DataFrame."""
        # If the arrow table has empty names, from_arrow produces
        # column_$i. But here we know there is only one such column
        # (by construction) and it should have an empty name.
        # https://github.com/pola-rs/polars/issues/11632
        # To guarantee we produce correct names, we therefore
        # serialise with names we control and rename with that map.
        name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
        metadata = [
            _create_polars_column_metadata(name, dtype.polars)
            for name, dtype in zip(name_map, self.dtypes, strict=True)
        ]
        table_with_metadata = _ObjectWithArrowMetadata(self.table, metadata)
        df = pl.DataFrame(table_with_metadata)
        # Re-apply per-column sortedness flags so polars can exploit them.
        return df.rename(name_map).with_columns(
            pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING)
            if c.is_sorted
            else pl.col(c.name)
            for c in self.columns
        )

    @cached_property
    def column_names_set(self) -> frozenset[str]:
        """Return the column names as a set."""
        return frozenset(self.column_map)

    @cached_property
    def column_names(self) -> list[str]:
        """Return a list of the column names."""
        return list(self.column_map)

    @cached_property
    def num_columns(self) -> int:
        """Number of columns."""
        return len(self.column_map)

    @cached_property
    def num_rows(self) -> int:
        """Number of rows."""
        # A frame with no columns has no meaningful row count; report 0.
        return self.table.num_rows() if self.column_map else 0

    @classmethod
    def from_polars(cls, df: pl.DataFrame) -> Self:
        """
        Create from a polars dataframe.

        Parameters
        ----------
        df
            Polars dataframe to convert

        Returns
        -------
        New dataframe representing the input.
        """
        plc_table = plc.Table.from_arrow(df)
        # Pair each device column with its host counterpart so host-side
        # metadata is carried over via Column.copy_metadata.
        return cls(
            Column(d_col, name=name, dtype=DataType(h_col.dtype)).copy_metadata(h_col)
            for d_col, h_col, name in zip(
                plc_table.columns(), df.iter_columns(), df.columns, strict=True
            )
        )

    @classmethod
    def from_table(
        cls, table: plc.Table, names: Sequence[str], dtypes: Sequence[DataType]
    ) -> Self:
        """
        Create from a pylibcudf table.

        Parameters
        ----------
        table
            Pylibcudf table to obtain columns from
        names
            Names for the columns
        dtypes
            Dtypes for the columns

        Returns
        -------
        New dataframe sharing data with the input table.

        Raises
        ------
        ValueError
            If the number of provided names does not match the
            number of columns in the table.
        """
        if table.num_columns() != len(names):
            raise ValueError("Mismatching name and table length.")
        return cls(
            Column(c, name=name, dtype=dtype)
            for c, name, dtype in zip(table.columns(), names, dtypes, strict=True)
        )

    @classmethod
    def deserialize(
        cls, header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
    ) -> Self:
        """
        Create a DataFrame from a serialized representation returned by `.serialize()`.

        Parameters
        ----------
        header
            The (unpickled) metadata required to reconstruct the object.
        frames
            Two-tuple of frames (a memoryview and a gpumemoryview).

        Returns
        -------
        DataFrame
            The deserialized DataFrame.
        """
        packed_metadata, packed_gpu_data = frames
        # Reconstruct the device table from the contiguous-split pair,
        # then rebuild each Column with its pickled constructor kwargs.
        table = plc.contiguous_split.unpack_from_memoryviews(
            packed_metadata, packed_gpu_data
        )
        return cls(
            Column(c, **Column.deserialize_ctor_kwargs(kw))
            for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
        )

    def serialize(
        self,
    ) -> tuple[DataFrameHeader, tuple[memoryview, plc.gpumemoryview]]:
        """
        Serialize the table into header and frames.

        Follows the Dask serialization scheme with a picklable header (dict) and
        a tuple of frames (in this case a contiguous host and device buffer).

        To enable dask support, dask serializers must be registered

        >>> from cudf_polars.experimental.dask_serialize import register
        >>> register()

        Returns
        -------
        header
            A dict containing any picklable metadata required to reconstruct the object.
        frames
            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
        """
        packed = plc.contiguous_split.pack(self.table)

        # Keyword arguments for `Column.__init__`.
        columns_kwargs: list[ColumnOptions] = [
            col.serialize_ctor_kwargs() for col in self.columns
        ]
        header: DataFrameHeader = {
            "columns_kwargs": columns_kwargs,
            # Number of frames the consumer should expect (host + device).
            "frame_count": 2,
        }
        return header, packed.release()

    def sorted_like(
        self, like: DataFrame, /, *, subset: Set[str] | None = None
    ) -> Self:
        """
        Return a shallow copy with sortedness copied from like.

        Parameters
        ----------
        like
            The dataframe to copy from
        subset
            Optional subset of column names for which to copy the
            sortedness flags; other columns are passed through unchanged.

        Returns
        -------
        Shallow copy of self with metadata set.

        Raises
        ------
        ValueError
            If there is a name mismatch between self and like.
        """
        if like.column_names != self.column_names:
            raise ValueError("Can only copy from identically named frame")
        subset = self.column_names_set if subset is None else subset
        return type(self)(
            c.sorted_like(other) if c.name in subset else c
            for c, other in zip(self.columns, like.columns, strict=True)
        )

    def with_columns(
        self, columns: Iterable[Column], *, replace_only: bool = False
    ) -> Self:
        """
        Return a new dataframe with extra columns.

        Parameters
        ----------
        columns
            Columns to add
        replace_only
            If true, then only replacements are allowed (matching by name).

        Returns
        -------
        New dataframe

        Raises
        ------
        ValueError
            If ``replace_only`` is true and a provided column name does
            not already exist in the frame.

        Notes
        -----
        If column names overlap, newer names replace older ones, and
        appear in the same order as the original frame.
        """
        new = {c.name: c for c in columns}
        if replace_only and not self.column_names_set.issuperset(new.keys()):
            raise ValueError("Cannot replace with non-existing names")
        # Dict merge keeps original positions for replaced names and
        # appends genuinely new columns at the end.
        return type(self)((self.column_map | new).values())

    def discard_columns(self, names: Set[str]) -> Self:
        """Drop columns by name."""
        return type(self)(column for column in self.columns if column.name not in names)

    def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
        """Select columns by name returning DataFrame."""
        try:
            return type(self)(self.column_map[name] for name in names)
        except KeyError as e:
            raise ValueError("Can't select missing names") from e

    def rename_columns(self, mapping: Mapping[str, str]) -> Self:
        """Rename some columns."""
        # Names absent from the mapping are kept as-is.
        return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns)

    def select_columns(self, names: Set[str]) -> list[Column]:
        """Select columns by name."""
        return [c for c in self.columns if c.name in names]

    def filter(self, mask: Column) -> Self:
        """Return a filtered table given a mask."""
        table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
        # Filtering preserves relative order, so sortedness flags carry over.
        return (
            type(self)
            .from_table(table, self.column_names, self.dtypes)
            .sorted_like(self)
        )

    def slice(self, zlice: Slice | None) -> Self:
        """
        Slice a dataframe.

        Parameters
        ----------
        zlice
            optional, tuple of start and length, negative values of start
            treated as for python indexing. If not provided, returns self.

        Returns
        -------
        New dataframe (if zlice is not None) otherwise self (if it is)
        """
        if zlice is None:
            return self
        (table,) = plc.copying.slice(
            self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
        )
        # Slicing preserves relative order, so sortedness flags carry over.
        return (
            type(self)
            .from_table(table, self.column_names, self.dtypes)
            .sorted_like(self)
        )
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""A datatype, preserving polars metadata."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from functools import cache
|
|
9
|
+
|
|
10
|
+
from typing_extensions import assert_never
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
|
|
14
|
+
import pylibcudf as plc
|
|
15
|
+
|
|
16
|
+
__all__ = ["DataType"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@cache
def _from_polars(dtype: pl.DataType) -> plc.DataType:
    """
    Convert a polars datatype to a pylibcudf one.

    Parameters
    ----------
    dtype
        Polars dtype to convert

    Returns
    -------
    Matching pylibcudf DataType object.

    Raises
    ------
    NotImplementedError
        For unsupported conversions.
    """
    if isinstance(dtype, pl.Boolean):
        return plc.DataType(plc.TypeId.BOOL8)
    elif isinstance(dtype, pl.Int8):
        return plc.DataType(plc.TypeId.INT8)
    elif isinstance(dtype, pl.Int16):
        return plc.DataType(plc.TypeId.INT16)
    elif isinstance(dtype, pl.Int32):
        return plc.DataType(plc.TypeId.INT32)
    elif isinstance(dtype, pl.Int64):
        return plc.DataType(plc.TypeId.INT64)
    # NOTE: this branch was previously a bare ``if``, breaking the elif
    # chain. Behaviour was unchanged (every prior branch returns), but it
    # is now ``elif`` for consistency with the rest of the dispatch.
    elif isinstance(dtype, pl.UInt8):
        return plc.DataType(plc.TypeId.UINT8)
    elif isinstance(dtype, pl.UInt16):
        return plc.DataType(plc.TypeId.UINT16)
    elif isinstance(dtype, pl.UInt32):
        return plc.DataType(plc.TypeId.UINT32)
    elif isinstance(dtype, pl.UInt64):
        return plc.DataType(plc.TypeId.UINT64)
    elif isinstance(dtype, pl.Float32):
        return plc.DataType(plc.TypeId.FLOAT32)
    elif isinstance(dtype, pl.Float64):
        return plc.DataType(plc.TypeId.FLOAT64)
    elif isinstance(dtype, pl.Date):
        return plc.DataType(plc.TypeId.TIMESTAMP_DAYS)
    elif isinstance(dtype, pl.Time):
        raise NotImplementedError("Time of day dtype not implemented")
    elif isinstance(dtype, pl.Datetime):
        if dtype.time_unit == "ms":
            return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS)
        elif dtype.time_unit == "us":
            return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
        elif dtype.time_unit == "ns":
            return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
        assert dtype.time_unit is not None  # pragma: no cover
        assert_never(dtype.time_unit)
    elif isinstance(dtype, pl.Duration):
        if dtype.time_unit == "ms":
            return plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
        elif dtype.time_unit == "us":
            return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
        elif dtype.time_unit == "ns":
            return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
        assert dtype.time_unit is not None  # pragma: no cover
        assert_never(dtype.time_unit)
    elif isinstance(dtype, pl.String):
        return plc.DataType(plc.TypeId.STRING)
    elif isinstance(dtype, pl.Decimal):
        # libcudf stores scale negated relative to polars.
        return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale)
    elif isinstance(dtype, pl.Null):
        # TODO: Hopefully
        return plc.DataType(plc.TypeId.EMPTY)
    elif isinstance(dtype, pl.List):
        # Recurse to catch unsupported inner types
        _ = _from_polars(dtype.inner)
        return plc.DataType(plc.TypeId.LIST)
    elif isinstance(dtype, pl.Struct):
        # Recurse to catch unsupported field types
        for field in dtype.fields:
            _ = _from_polars(field.dtype)
        return plc.DataType(plc.TypeId.STRUCT)
    else:
        raise NotImplementedError(f"{dtype=} conversion not supported")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class DataType:
    """A datatype, preserving polars metadata."""

    # The originating polars dtype (kept for metadata libcudf drops).
    polars: pl.datatypes.DataType
    # The equivalent pylibcudf dtype.
    plc: plc.DataType

    def __init__(self, polars_dtype: pl.DataType) -> None:
        self.polars = polars_dtype
        self.plc = _from_polars(polars_dtype)

    def id(self) -> plc.TypeId:
        """The pylibcudf.TypeId of this DataType."""
        return self.plc.id()

    @property
    def children(self) -> list[DataType]:
        """The children types of this DataType."""
        type_id = self.plc.id()
        if type_id == plc.TypeId.LIST:
            return [DataType(self.polars.inner)]
        if type_id == plc.TypeId.STRUCT:
            return [DataType(field.dtype) for field in self.polars.fields]
        # Non-nested types have no children.
        return []

    def __eq__(self, other: object) -> bool:
        """Equality of DataTypes."""
        # The polars dtype fully determines the pylibcudf one, so it is
        # the sole basis for equality (and hashing, below).
        return isinstance(other, DataType) and self.polars == other.polars

    def __hash__(self) -> int:
        """Hash of the DataType."""
        return hash(self.polars)

    def __repr__(self) -> str:
        """Representation of the DataType."""
        return f"<DataType(polars={self.polars}, plc={self.id()!r})>"
|
cudf_polars/dsl/expr.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
# TODO: remove need for this
|
|
4
|
+
# ruff: noqa: D101
|
|
5
|
+
"""
|
|
6
|
+
DSL nodes for the polars expression language.
|
|
7
|
+
|
|
8
|
+
An expression node is a function, `DataFrame -> Column`.
|
|
9
|
+
|
|
10
|
+
The evaluation context is provided by a LogicalPlan node, and can
|
|
11
|
+
affect the evaluation rule as well as providing the dataframe input.
|
|
12
|
+
In particular, the interpretation of the expression language in a
|
|
13
|
+
`GroupBy` node is groupwise, rather than whole frame.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from cudf_polars.dsl.expressions.aggregation import Agg
|
|
19
|
+
from cudf_polars.dsl.expressions.base import (
|
|
20
|
+
AggInfo,
|
|
21
|
+
Col,
|
|
22
|
+
ColRef,
|
|
23
|
+
ErrorExpr,
|
|
24
|
+
Expr,
|
|
25
|
+
NamedExpr,
|
|
26
|
+
)
|
|
27
|
+
from cudf_polars.dsl.expressions.binaryop import BinOp
|
|
28
|
+
from cudf_polars.dsl.expressions.boolean import BooleanFunction
|
|
29
|
+
from cudf_polars.dsl.expressions.datetime import TemporalFunction
|
|
30
|
+
from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
|
|
31
|
+
from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow
|
|
32
|
+
from cudf_polars.dsl.expressions.selection import Filter, Gather
|
|
33
|
+
from cudf_polars.dsl.expressions.slicing import Slice
|
|
34
|
+
from cudf_polars.dsl.expressions.sorting import Sort, SortBy
|
|
35
|
+
from cudf_polars.dsl.expressions.string import StringFunction
|
|
36
|
+
from cudf_polars.dsl.expressions.struct import StructFunction
|
|
37
|
+
from cudf_polars.dsl.expressions.ternary import Ternary
|
|
38
|
+
from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"Agg",
|
|
42
|
+
"AggInfo",
|
|
43
|
+
"BinOp",
|
|
44
|
+
"BooleanFunction",
|
|
45
|
+
"Cast",
|
|
46
|
+
"Col",
|
|
47
|
+
"ColRef",
|
|
48
|
+
"ErrorExpr",
|
|
49
|
+
"Expr",
|
|
50
|
+
"Filter",
|
|
51
|
+
"Gather",
|
|
52
|
+
"GroupedRollingWindow",
|
|
53
|
+
"Len",
|
|
54
|
+
"Literal",
|
|
55
|
+
"LiteralColumn",
|
|
56
|
+
"NamedExpr",
|
|
57
|
+
"RollingWindow",
|
|
58
|
+
"Slice",
|
|
59
|
+
"Sort",
|
|
60
|
+
"SortBy",
|
|
61
|
+
"StringFunction",
|
|
62
|
+
"StructFunction",
|
|
63
|
+
"TemporalFunction",
|
|
64
|
+
"Ternary",
|
|
65
|
+
"UnaryFunction",
|
|
66
|
+
]
|