cudf-polars-cu12 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/__init__.py +9 -3
  3. cudf_polars/callback.py +258 -23
  4. cudf_polars/containers/__init__.py +2 -2
  5. cudf_polars/containers/column.py +167 -66
  6. cudf_polars/containers/dataframe.py +157 -58
  7. cudf_polars/dsl/expr.py +37 -1397
  8. cudf_polars/dsl/expressions/__init__.py +8 -0
  9. cudf_polars/dsl/expressions/aggregation.py +246 -0
  10. cudf_polars/dsl/expressions/base.py +300 -0
  11. cudf_polars/dsl/expressions/binaryop.py +135 -0
  12. cudf_polars/dsl/expressions/boolean.py +312 -0
  13. cudf_polars/dsl/expressions/datetime.py +196 -0
  14. cudf_polars/dsl/expressions/literal.py +91 -0
  15. cudf_polars/dsl/expressions/rolling.py +40 -0
  16. cudf_polars/dsl/expressions/selection.py +92 -0
  17. cudf_polars/dsl/expressions/sorting.py +97 -0
  18. cudf_polars/dsl/expressions/string.py +362 -0
  19. cudf_polars/dsl/expressions/ternary.py +53 -0
  20. cudf_polars/dsl/expressions/unary.py +339 -0
  21. cudf_polars/dsl/ir.py +1202 -427
  22. cudf_polars/dsl/nodebase.py +150 -0
  23. cudf_polars/dsl/to_ast.py +318 -0
  24. cudf_polars/dsl/translate.py +398 -181
  25. cudf_polars/dsl/traversal.py +175 -0
  26. cudf_polars/experimental/__init__.py +8 -0
  27. cudf_polars/experimental/base.py +43 -0
  28. cudf_polars/experimental/dask_serialize.py +59 -0
  29. cudf_polars/experimental/dispatch.py +84 -0
  30. cudf_polars/experimental/io.py +325 -0
  31. cudf_polars/experimental/parallel.py +253 -0
  32. cudf_polars/experimental/select.py +36 -0
  33. cudf_polars/testing/asserts.py +139 -19
  34. cudf_polars/testing/plugin.py +242 -0
  35. cudf_polars/typing/__init__.py +51 -10
  36. cudf_polars/utils/dtypes.py +88 -39
  37. cudf_polars/utils/sorting.py +2 -2
  38. cudf_polars/utils/versions.py +22 -0
  39. {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +15 -12
  40. cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
  41. {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
  42. cudf_polars_cu12-24.8.0a281.dist-info/RECORD +0 -23
  43. {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
  44. {cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0
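
The diff below covers cudf_polars/containers/dataframe.py (item 6). For orientation: in normal use cudf-polars is not imported directly; polars dispatches to it through the GPU engine implemented in cudf_polars/callback.py. A minimal sketch of how the package is exercised (the frame contents here are made up for illustration):

    >>> import polars as pl
    >>> q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
    >>> q.collect(engine="gpu")  # routed through cudf_polars.callback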
cudf_polars/containers/dataframe.py
@@ -5,82 +5,119 @@
 
 from __future__ import annotations
 
-import itertools
+import pickle
 from functools import cached_property
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast
+
+import pyarrow as pa
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
-from cudf_polars.containers.column import NamedColumn
+from cudf_polars.containers import Column
+from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping, Sequence, Set
+    from collections.abc import Iterable, Mapping, Sequence, Set
 
-    import pyarrow as pa
     from typing_extensions import Self
 
-    import cudf
 
-    from cudf_polars.containers import Column
+__all__: list[str] = ["DataFrame"]
 
 
-__all__: list[str] = ["DataFrame"]
+# Pacify the type checker. DataFrame init asserts that all the columns
+# have a string name, so let's narrow the type.
+class NamedColumn(Column):
+    name: str
 
 
 class DataFrame:
     """A representation of a dataframe."""
 
-    columns: list[NamedColumn]
+    column_map: dict[str, Column]
     table: plc.Table
+    columns: list[NamedColumn]
 
-    def __init__(self, columns: Sequence[NamedColumn]) -> None:
-        self.columns = list(columns)
-        self._column_map = {c.name: c for c in self.columns}
-        self.table = plc.Table([c.obj for c in columns])
+    def __init__(self, columns: Iterable[Column]) -> None:
+        columns = list(columns)
+        if any(c.name is None for c in columns):
+            raise ValueError("All columns must have a name")
+        self.columns = [cast(NamedColumn, c) for c in columns]
+        self.column_map = {c.name: c for c in self.columns}
+        self.table = plc.Table([c.obj for c in self.columns])
 
     def copy(self) -> Self:
         """Return a shallow copy of self."""
-        return type(self)([c.copy() for c in self.columns])
+        return type(self)(c.copy() for c in self.columns)
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
-        table: pa.Table = plc.interop.to_arrow(
+        # If the arrow table has empty names, from_arrow produces
+        # column_$i. But here we know there is only one such column
+        # (by construction) and it should have an empty name.
+        # https://github.com/pola-rs/polars/issues/11632
+        # To guarantee we produce correct names, we therefore
+        # serialise with names we control and rename with that map.
+        name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
+        table = plc.interop.to_arrow(
             self.table,
-            [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
+            [plc.interop.ColumnMetadata(name=name) for name in name_map],
+        )
+        df: pl.DataFrame = pl.from_arrow(table)
+        return df.rename(name_map).with_columns(
+            pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING)
+            if c.is_sorted
+            else pl.col(c.name)
+            for c in self.columns
         )
-
-        return cast(pl.DataFrame, pl.from_arrow(table))
 
     @cached_property
     def column_names_set(self) -> frozenset[str]:
         """Return the column names as a set."""
-        return frozenset(c.name for c in self.columns)
+        return frozenset(self.column_map)
 
     @cached_property
     def column_names(self) -> list[str]:
         """Return a list of the column names."""
-        return [c.name for c in self.columns]
+        return list(self.column_map)
 
     @cached_property
     def num_columns(self) -> int:
         """Number of columns."""
-        return len(self.columns)
+        return len(self.column_map)
 
     @cached_property
     def num_rows(self) -> int:
         """Number of rows."""
-        return 0 if len(self.columns) == 0 else self.table.num_rows()
+        return self.table.num_rows() if self.column_map else 0
 
     @classmethod
-    def from_cudf(cls, df: cudf.DataFrame) -> Self:
-        """Create from a cudf dataframe."""
+    def from_polars(cls, df: pl.DataFrame) -> Self:
+        """
+        Create from a polars dataframe.
+
+        Parameters
+        ----------
+        df
+            Polars dataframe to convert
+
+        Returns
+        -------
+        New dataframe representing the input.
+        """
+        table = df.to_arrow()
+        schema = table.schema
+        for i, field in enumerate(schema):
+            schema = schema.set(
+                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
+            )
+        # No-op if the schema is unchanged.
+        d_table = plc.interop.from_arrow(table.cast(schema))
         return cls(
-            [
-                NamedColumn(c.to_pylibcudf(mode="read"), name)
-                for name, c in df._data.items()
-            ]
+            Column(column).copy_metadata(h_col)
+            for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True)
         )
 
     @classmethod
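
The from_cudf constructor is gone; host interchange now goes through polars and arrow via from_polars/to_polars. A round-trip sketch against the new API, with hypothetical data:

    >>> import polars as pl
    >>> from cudf_polars.containers import DataFrame
    >>> hdf = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    >>> df = DataFrame.from_polars(hdf)  # device copy, metadata taken from hdf
    >>> df.num_rows, df.column_names
    (2, ['a', 'b'])
    >>> assert df.to_polars().equals(hdf)  # back to a host polars frame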
@@ -108,15 +145,83 @@ class DataFrame:
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
         return cls(
-            # TODO: strict=True when we drop py39
-            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
+            Column(c, name=name) for c, name in zip(table.columns(), names, strict=True)
         )
 
+    @classmethod
+    def deserialize(
+        cls, header: Mapping[str, Any], frames: tuple[memoryview, plc.gpumemoryview]
+    ) -> Self:
+        """
+        Create a DataFrame from a serialized representation returned by `.serialize()`.
+
+        Parameters
+        ----------
+        header
+            The (unpickled) metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames (a memoryview and a gpumemoryview).
+
+        Returns
+        -------
+        DataFrame
+            The deserialized DataFrame.
+        """
+        packed_metadata, packed_gpu_data = frames
+        table = plc.contiguous_split.unpack_from_memoryviews(
+            packed_metadata, packed_gpu_data
+        )
+        return cls(
+            Column(c, **kw)
+            for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
+        )
+
+    def serialize(
+        self,
+    ) -> tuple[Mapping[str, Any], tuple[memoryview, plc.gpumemoryview]]:
+        """
+        Serialize the table into header and frames.
+
+        Follows the Dask serialization scheme with a picklable header (dict) and
+        a tuple of frames (in this case a contiguous host and device buffer).
+
+        To enable dask support, dask serializers must be registered
+
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
+
+        Returns
+        -------
+        header
+            A dict containing any picklable metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames suitable for passing to `unpack_from_memoryviews`
+        """
+        packed = plc.contiguous_split.pack(self.table)
+
+        # Keyword arguments for `Column.__init__`.
+        columns_kwargs = [
+            {
+                "is_sorted": col.is_sorted,
+                "order": col.order,
+                "null_order": col.null_order,
+                "name": col.name,
+            }
+            for col in self.columns
+        ]
+        header = {
+            "columns_kwargs": columns_kwargs,
+            # Dask Distributed uses "type-serialized" to dispatch deserialization
+            "type-serialized": pickle.dumps(type(self)),
+            "frame_count": 2,
+        }
+        return header, packed.release()
+
     def sorted_like(
         self, like: DataFrame, /, *, subset: Set[str] | None = None
     ) -> Self:
         """
-        Copy sortedness from a dataframe onto self.
+        Return a shallow copy with sortedness copied from like.
 
         Parameters
         ----------
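
The new serialize/deserialize pair backs the Dask support added in cudf_polars/experimental/dask_serialize.py. A direct round-trip sketch, without Dask, assuming df is the DataFrame from the previous sketch:

    >>> header, frames = df.serialize()  # frames: (memoryview, gpumemoryview)
    >>> header["frame_count"]
    2
    >>> restored = DataFrame.deserialize(header, frames)

Note that deserialize expects the already-unpickled header; over the wire, Dask Distributed pickles it and dispatches on its "type-serialized" entry.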
@@ -127,7 +232,7 @@ class DataFrame:
 
         Returns
         -------
-        Self with metadata set.
+        Shallow copy of self with metadata set.
 
         Raises
         ------
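
Together with the implementation change in the next hunk, this makes sorted_like non-mutating: it previously updated self.columns in place and returned self, so 25.2 callers must rebind the result. A sketch, where like_df is a hypothetical frame with identical column names:

    >>> df = df.sorted_like(like_df)  # 25.2: the returned copy must be rebound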
@@ -137,14 +242,12 @@ class DataFrame:
         if like.column_names != self.column_names:
             raise ValueError("Can only copy from identically named frame")
         subset = self.column_names_set if subset is None else subset
-        self.columns = [
+        return type(self)(
             c.sorted_like(other) if c.name in subset else c
-            # TODO: strict=True when we drop py39
-            for c, other in zip(self.columns, like.columns)
-        ]
-        return self
+            for c, other in zip(self.columns, like.columns, strict=True)
+        )
 
-    def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
+    def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -152,6 +255,8 @@
         ----------
         columns
             Columns to add
+        replace_only
+            If true, then only replacements are allowed (matching by name).
 
         Returns
         -------
@@ -159,36 +264,30 @@
 
         Notes
         -----
-        If column names overlap, newer names replace older ones.
+        If column names overlap, newer names replace older ones, and
+        appear in the same order as the original frame.
         """
-        columns = list(
-            {c.name: c for c in itertools.chain(self.columns, columns)}.values()
-        )
-        return type(self)(columns)
+        new = {c.name: c for c in columns}
+        if replace_only and not self.column_names_set.issuperset(new.keys()):
+            raise ValueError("Cannot replace with non-existing names")
+        return type(self)((self.column_map | new).values())
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
-        return type(self)([c for c in self.columns if c.name not in names])
+        return type(self)(column for column in self.columns if column.name not in names)
 
     def select(self, names: Sequence[str]) -> Self:
         """Select columns by name returning DataFrame."""
-        want = set(names)
-        if not want.issubset(self.column_names_set):
-            raise ValueError("Can't select missing names")
-        return type(self)([self._column_map[name] for name in names])
-
-    def replace_columns(self, *columns: NamedColumn) -> Self:
-        """Return a new dataframe with columns replaced by name."""
-        new = {c.name: c for c in columns}
-        if not set(new).issubset(self.column_names_set):
-            raise ValueError("Cannot replace with non-existing names")
-        return type(self)([new.get(c.name, c) for c in self.columns])
+        try:
+            return type(self)(self.column_map[name] for name in names)
+        except KeyError as e:
+            raise ValueError("Can't select missing names") from e
 
     def rename_columns(self, mapping: Mapping[str, str]) -> Self:
         """Rename some columns."""
-        return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])
+        return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns)
 
-    def select_columns(self, names: Set[str]) -> list[NamedColumn]:
+    def select_columns(self, names: Set[str]) -> list[Column]:
         """Select columns by name."""
         return [c for c in self.columns if c.name in names]
 
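
replace_columns is removed above, and with_columns(..., replace_only=True) takes over its role; per the Notes change, replacements keep their position in the original column order instead of being appended. A sketch, where col is a hypothetical Column whose name already exists in df:

    >>> df2 = df.with_columns([col], replace_only=True)
    >>> df2.column_names == df.column_names  # order preserved
    True

Passing a column with an unknown name while replace_only=True raises ValueError("Cannot replace with non-existing names").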