cudf-polars-cu12 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- cudf_polars/VERSION +1 -1
- cudf_polars/__init__.py +9 -3
- cudf_polars/callback.py +258 -23
- cudf_polars/containers/__init__.py +2 -2
- cudf_polars/containers/column.py +167 -66
- cudf_polars/containers/dataframe.py +157 -58
- cudf_polars/dsl/expr.py +37 -1397
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +246 -0
- cudf_polars/dsl/expressions/base.py +300 -0
- cudf_polars/dsl/expressions/binaryop.py +135 -0
- cudf_polars/dsl/expressions/boolean.py +312 -0
- cudf_polars/dsl/expressions/datetime.py +196 -0
- cudf_polars/dsl/expressions/literal.py +91 -0
- cudf_polars/dsl/expressions/rolling.py +40 -0
- cudf_polars/dsl/expressions/selection.py +92 -0
- cudf_polars/dsl/expressions/sorting.py +97 -0
- cudf_polars/dsl/expressions/string.py +362 -0
- cudf_polars/dsl/expressions/ternary.py +53 -0
- cudf_polars/dsl/expressions/unary.py +339 -0
- cudf_polars/dsl/ir.py +1202 -427
- cudf_polars/dsl/nodebase.py +150 -0
- cudf_polars/dsl/to_ast.py +318 -0
- cudf_polars/dsl/translate.py +398 -181
- cudf_polars/dsl/traversal.py +175 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +43 -0
- cudf_polars/experimental/dask_serialize.py +59 -0
- cudf_polars/experimental/dispatch.py +84 -0
- cudf_polars/experimental/io.py +325 -0
- cudf_polars/experimental/parallel.py +253 -0
- cudf_polars/experimental/select.py +36 -0
- cudf_polars/testing/asserts.py +139 -19
- cudf_polars/testing/plugin.py +242 -0
- cudf_polars/typing/__init__.py +51 -10
- cudf_polars/utils/dtypes.py +88 -39
- cudf_polars/utils/sorting.py +2 -2
- cudf_polars/utils/versions.py +22 -0
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +15 -12
- cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu12-24.8.0a281.dist-info/RECORD +0 -23
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
- {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0
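The headline structural changes are the split of the monolithic dsl/expr.py into a dsl/expressions/ subpackage, the new experimental/ package (Dask serialization plus multi-partition execution), and the switch from cudf's bundled pylibcudf to the standalone pylibcudf package. All of this sits behind the polars GPU engine, so end-user code is unchanged; a minimal usage sketch (the query itself is illustrative and assumes a working CUDA setup):

    import polars as pl

    q = (
        pl.LazyFrame({"key": [1, 1, 2], "value": [1.0, 2.0, 3.0]})
        .group_by("key")
        .agg(pl.col("value").sum())
    )
    # Executed by cudf-polars; queries it cannot handle transparently
    # fall back to the default CPU engine.
    result = q.collect(engine="gpu")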
cudf_polars/containers/dataframe.py  (24.8.0a281 → 25.2.0)

@@ -5,82 +5,119 @@
 
 from __future__ import annotations
 
-import itertools
+import pickle
 from functools import cached_property
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast
+
+import pyarrow as pa
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
-from cudf_polars.containers.column import NamedColumn
+from cudf_polars.containers import Column
+from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping, Sequence, Set
+    from collections.abc import Iterable, Mapping, Sequence, Set
 
-    import pyarrow as pa
     from typing_extensions import Self
 
-    import cudf
 
-
+__all__: list[str] = ["DataFrame"]
 
 
-
+# Pacify the type checker. DataFrame init asserts that all the columns
+# have a string name, so let's narrow the type.
+class NamedColumn(Column):
+    name: str
 
 
 class DataFrame:
     """A representation of a dataframe."""
 
-    columns: list[NamedColumn]
+    column_map: dict[str, Column]
     table: plc.Table
+    columns: list[NamedColumn]
 
-    def __init__(self, columns: Sequence[NamedColumn]) -> None:
-        self.columns = list(columns)
-        self._column_map = {c.name: c for c in self.columns}
-        self.table = plc.Table([c.obj for c in columns])
+    def __init__(self, columns: Iterable[Column]) -> None:
+        columns = list(columns)
+        if any(c.name is None for c in columns):
+            raise ValueError("All columns must have a name")
+        self.columns = [cast(NamedColumn, c) for c in columns]
+        self.column_map = {c.name: c for c in self.columns}
+        self.table = plc.Table([c.obj for c in self.columns])
 
     def copy(self) -> Self:
         """Return a shallow copy of self."""
-        return type(self)([c.copy() for c in self.columns])
+        return type(self)(c.copy() for c in self.columns)
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
-        table = plc.interop.to_arrow(
+        # If the arrow table has empty names, from_arrow produces
+        # column_$i. But here we know there is only one such column
+        # (by construction) and it should have an empty name.
+        # https://github.com/pola-rs/polars/issues/11632
+        # To guarantee we produce correct names, we therefore
+        # serialise with names we control and rename with that map.
+        name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
+        table = plc.interop.to_arrow(
             self.table,
-            [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
+            [plc.interop.ColumnMetadata(name=name) for name in name_map],
+        )
+        df: pl.DataFrame = pl.from_arrow(table)
+        return df.rename(name_map).with_columns(
+            pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING)
+            if c.is_sorted
+            else pl.col(c.name)
+            for c in self.columns
         )
-
-        return cast(pl.DataFrame, pl.from_arrow(table))
 
     @cached_property
     def column_names_set(self) -> frozenset[str]:
         """Return the column names as a set."""
-        return frozenset(c.name for c in self.columns)
+        return frozenset(self.column_map)
 
     @cached_property
     def column_names(self) -> list[str]:
         """Return a list of the column names."""
-        return [c.name for c in self.columns]
+        return list(self.column_map)
 
     @cached_property
     def num_columns(self) -> int:
         """Number of columns."""
-        return len(self.columns)
+        return len(self.column_map)
 
     @cached_property
     def num_rows(self) -> int:
         """Number of rows."""
-        return 0 if len(self.columns) == 0 else self.table.num_rows()
+        return self.table.num_rows() if self.column_map else 0
 
     @classmethod
-    def from_cudf(cls, df: cudf.DataFrame) -> Self:
-        """Create from a cudf dataframe."""
+    def from_polars(cls, df: pl.DataFrame) -> Self:
+        """
+        Create from a polars dataframe.
+
+        Parameters
+        ----------
+        df
+            Polars dataframe to convert
+
+        Returns
+        -------
+        New dataframe representing the input.
+        """
+        table = df.to_arrow()
+        schema = table.schema
+        for i, field in enumerate(schema):
+            schema = schema.set(
+                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
+            )
+        # No-op if the schema is unchanged.
+        d_table = plc.interop.from_arrow(table.cast(schema))
         return cls(
-            [
-                NamedColumn(c.to_pylibcudf(mode="read"), name)
-                for name, c in df._data.items()
-            ]
+            Column(column).copy_metadata(h_col)
+            for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True)
        )
 
     @classmethod
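Construction now round-trips through Arrow in both directions. A rough sketch of how the reworked container fits together (these are internal cudf-polars APIs, so treat the calls as illustrative rather than stable; the example frame is made up):

    import polars as pl

    from cudf_polars.containers import DataFrame

    pdf = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    gdf = DataFrame.from_polars(pdf)  # host -> device via arrow
    assert gdf.column_names == ["a", "b"]
    assert gdf.num_rows == 3
    # device -> host; real names are restored through the name_map rename
    assert gdf.to_polars().equals(pdf)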
@@ -108,15 +145,83 @@ class DataFrame:
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
         return cls(
-            # TODO: strict=True when we drop py39
-            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
+            Column(c, name=name) for c, name in zip(table.columns(), names, strict=True)
         )
 
+    @classmethod
+    def deserialize(
+        cls, header: Mapping[str, Any], frames: tuple[memoryview, plc.gpumemoryview]
+    ) -> Self:
+        """
+        Create a DataFrame from a serialized representation returned by `.serialize()`.
+
+        Parameters
+        ----------
+        header
+            The (unpickled) metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames (a memoryview and a gpumemoryview).
+
+        Returns
+        -------
+        DataFrame
+            The deserialized DataFrame.
+        """
+        packed_metadata, packed_gpu_data = frames
+        table = plc.contiguous_split.unpack_from_memoryviews(
+            packed_metadata, packed_gpu_data
+        )
+        return cls(
+            Column(c, **kw)
+            for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
+        )
+
+    def serialize(
+        self,
+    ) -> tuple[Mapping[str, Any], tuple[memoryview, plc.gpumemoryview]]:
+        """
+        Serialize the table into header and frames.
+
+        Follows the Dask serialization scheme with a picklable header (dict) and
+        a tuple of frames (in this case a contiguous host and device buffer).
+
+        To enable dask support, dask serializers must be registered
+
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
+
+        Returns
+        -------
+        header
+            A dict containing any picklable metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames suitable for passing to `unpack_from_memoryviews`
+        """
+        packed = plc.contiguous_split.pack(self.table)
+
+        # Keyword arguments for `Column.__init__`.
+        columns_kwargs = [
+            {
+                "is_sorted": col.is_sorted,
+                "order": col.order,
+                "null_order": col.null_order,
+                "name": col.name,
+            }
+            for col in self.columns
+        ]
+        header = {
+            "columns_kwargs": columns_kwargs,
+            # Dask Distributed uses "type-serialized" to dispatch deserialization
+            "type-serialized": pickle.dumps(type(self)),
+            "frame_count": 2,
+        }
+        return header, packed.release()
+
     def sorted_like(
         self, like: DataFrame, /, *, subset: Set[str] | None = None
     ) -> Self:
         """
-        Copy sortedness from a dataframe onto self.
+        Return a shallow copy with sortedness copied from like.
 
         Parameters
         ----------
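The serialize/deserialize pair above is what the new experimental/dask_serialize.py registers with Dask, but it can also be exercised directly; a sketch using the same internal APIs (register() is only needed when frames travel through Dask itself):

    import polars as pl

    from cudf_polars.containers import DataFrame

    gdf = DataFrame.from_polars(pl.DataFrame({"a": [1, 2, 3]}))
    # header is picklable metadata; frames is a
    # (host memoryview, device gpumemoryview) pair.
    header, frames = gdf.serialize()
    clone = DataFrame.deserialize(header, frames)
    assert clone.column_names == gdf.column_names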
@@ -127,7 +232,7 @@ class DataFrame:
 
         Returns
         -------
-        Self with metadata set.
+        Shallow copy of self with metadata set.
 
         Raises
         ------
@@ -137,14 +242,12 @@ class DataFrame:
         if like.column_names != self.column_names:
             raise ValueError("Can only copy from identically named frame")
         subset = self.column_names_set if subset is None else subset
-        self.columns = [
+        return type(self)(
             c.sorted_like(other) if c.name in subset else c
-            # TODO: strict=True when we drop py39
-            for c, other in zip(self.columns, like.columns)
-        ]
-        return self
+            for c, other in zip(self.columns, like.columns, strict=True)
+        )
 
-    def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
+    def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self:
         """
         Return a new dataframe with extra columns.
 
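Note the behavioural change in sorted_like: it previously rebound self.columns and returned self, while it now leaves the receiver untouched and returns a fresh frame, so callers must use the return value. A two-line sketch (names illustrative):

    # 24.8: frame.sorted_like(like) mutated `frame` in place.
    # 25.2: it returns a new DataFrame and `frame` is unchanged.
    frame = frame.sorted_like(like)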
@@ -152,6 +255,8 @@ class DataFrame:
         ----------
         columns
             Columns to add
+        replace_only
+            If true, then only replacements are allowed (matching by name).
 
         Returns
         -------
@@ -159,36 +264,30 @@ class DataFrame:
 
         Notes
         -----
-        If column names overlap, newer names replace older ones
+        If column names overlap, newer names replace older ones, and
+        appear in the same order as the original frame.
         """
-        columns = list(
-            {c.name: c for c in itertools.chain(self.columns, columns)}.values()
-        )
-        return type(self)(columns)
+        new = {c.name: c for c in columns}
+        if replace_only and not self.column_names_set.issuperset(new.keys()):
+            raise ValueError("Cannot replace with non-existing names")
+        return type(self)((self.column_map | new).values())
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
-        return type(self)([c for c in self.columns if c.name not in names])
+        return type(self)(column for column in self.columns if column.name not in names)
 
     def select(self, names: Sequence[str]) -> Self:
         """Select columns by name returning DataFrame."""
-        want = set(names)
-        if not want.issubset(self.column_names_set):
-            raise ValueError("Can only select from existing columns")
-        return type(self)([self._column_map[name] for name in names])
-
-    def replace_columns(self, *columns: NamedColumn) -> Self:
-        """Return a new dataframe with columns replaced by name."""
-        new = {c.name: c for c in columns}
-        if not set(new).issubset(self.column_names_set):
-            raise ValueError("Cannot replace with non-existing names")
-        return type(self)([new.get(c.name, c) for c in self.columns])
+        try:
+            return type(self)(self.column_map[name] for name in names)
+        except KeyError as e:
+            raise ValueError("Can't select missing names") from e
 
     def rename_columns(self, mapping: Mapping[str, str]) -> Self:
         """Rename some columns."""
-        return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])
+        return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns)
 
-    def select_columns(self, names: Set[str]) -> list[NamedColumn]:
+    def select_columns(self, names: Set[str]) -> list[Column]:
         """Select columns by name."""
         return [c for c in self.columns if c.name in names]
 
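Taken together, the tail of the class drops replace_columns in favour of a replace_only flag on with_columns, and select now converts the column map's KeyError into a ValueError. A sketch of the new calling conventions (relying on Column.rename returning a renamed copy, as rename_columns above does; the frame is illustrative):

    import polars as pl

    from cudf_polars.containers import DataFrame

    gdf = DataFrame.from_polars(pl.DataFrame({"a": [1, 2], "b": [3, 4]}))

    # Replacement by name: the new "a" keeps its original position.
    replacement = gdf.column_map["b"].rename("a")
    replaced = gdf.with_columns([replacement], replace_only=True)
    assert replaced.column_names == ["a", "b"]

    try:
        gdf.select(["missing"])
    except ValueError:  # "Can't select missing names"
        pass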