cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (63)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION CHANGED
@@ -1 +1 @@
-25.02.02
+25.06.00
cudf_polars/callback.py CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Callback for the polars collect function to execute on device."""
@@ -7,11 +7,13 @@ from __future__ import annotations
 
 import contextlib
 import os
+import time
 import warnings
 from functools import cache, partial
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, overload
 
 import nvtx
+from typing_extensions import assert_never
 
 from polars.exceptions import ComputeError, PerformanceWarning
 
@@ -20,6 +22,7 @@ import rmm
 from rmm._cuda import gpu
 
 from cudf_polars.dsl.translate import Translator
+from cudf_polars.utils.timer import Timer
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -29,6 +32,7 @@ if TYPE_CHECKING:
 
     from cudf_polars.dsl.ir import IR
     from cudf_polars.typing import NodeTraverser
+    from cudf_polars.utils.config import ConfigOptions
 
 __all__: list[str] = ["execute_with_cudf"]
 
@@ -41,7 +45,7 @@ _SUPPORTED_PREFETCHES = {
 }
 
 
-def _env_get_int(name, default):
+def _env_get_int(name: str, default: int) -> int:
     try:
         return int(os.getenv(name, default))
     except (ValueError, TypeError):  # pragma: no cover
@@ -173,74 +177,72 @@ def set_device(device: int | None) -> Generator[int, None, None]:
         gpu.setDevice(previous)
 
 
+@overload
 def _callback(
     ir: IR,
     with_columns: list[str] | None,
     pyarrow_predicate: str | None,
     n_rows: int | None,
+    should_time: Literal[False],
     *,
-    device: int | None,
-    memory_resource: int | None,
-    executor: Literal["pylibcudf", "dask-experimental"] | None,
-) -> pl.DataFrame:
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
+    timer: Timer | None,
+) -> pl.DataFrame: ...
+
+
+@overload
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+    should_time: Literal[True],
+    *,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
+    timer: Timer | None,
+) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
+
+
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+    should_time: bool,  # noqa: FBT001
+    *,
+    memory_resource: rmm.mr.DeviceMemoryResource | None,
+    config_options: ConfigOptions,
+    timer: Timer | None,
+) -> pl.DataFrame | tuple[pl.DataFrame, list[tuple[int, int, str]]]:
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
+    if timer is not None:
+        assert should_time
     with (
         nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
         # Device must be set before memory resource is obtained.
-        set_device(device),
+        set_device(config_options.device),
         set_memory_resource(memory_resource),
     ):
-        if executor is None or executor == "pylibcudf":
-            return ir.evaluate(cache={}).to_polars()
-        elif executor == "dask-experimental":
-            from cudf_polars.experimental.parallel import evaluate_dask
-
-            return evaluate_dask(ir).to_polars()
-        else:
-            raise ValueError(f"Unknown executor '{executor}'")
-
-
-def validate_config_options(config: dict) -> None:
-    """
-    Validate the configuration options for the GPU engine.
-
-    Parameters
-    ----------
-    config
-        Configuration options to validate.
-
-    Raises
-    ------
-    ValueError
-        If the configuration contains unsupported options.
-    """
-    if unsupported := (
-        config.keys()
-        - {"raise_on_fail", "parquet_options", "executor", "executor_options"}
-    ):
-        raise ValueError(
-            f"Engine configuration contains unsupported settings: {unsupported}"
-        )
-    assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
-        config.get("parquet_options", {})
-    )
-
-    # Validate executor_options
-    executor = config.get("executor", "pylibcudf")
-    if executor == "dask-experimental":
-        unsupported = config.get("executor_options", {}).keys() - {
-            "max_rows_per_partition",
-            "parquet_blocksize",
-        }
-    else:
-        unsupported = config.get("executor_options", {}).keys()
-    if unsupported:
-        raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
-
-
-def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
+        if config_options.executor.name == "in-memory":
+            df = ir.evaluate(cache={}, timer=timer).to_polars()
+            if timer is None:
+                return df
+            else:
+                return df, timer.timings
+        elif config_options.executor.name == "streaming":
+            from cudf_polars.experimental.parallel import evaluate_streaming
+
+            return evaluate_streaming(ir, config_options).to_polars()
+        assert_never(f"Unknown executor '{config_options.executor}'")
+
+
+def execute_with_cudf(
+    nt: NodeTraverser, duration_since_start: int | None, *, config: GPUEngine
+) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
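The dispatch above renames the executors: "pylibcudf" becomes "in-memory" and "dask-experimental" becomes "streaming". A minimal sketch of selecting between them from the polars side; the `executor` and `executor_options` engine keys are carried over from the removed `validate_config_options`, and the option name shown is an assumption (the new option handling lives in the added cudf_polars/utils/config.py, whose body this diff does not display):

    import polars as pl

    q = pl.scan_parquet("data.parquet").group_by("key").agg(pl.col("x").sum())

    # Default single-GPU execution (the "in-memory" executor).
    result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))

    # Opt in to the partitioned "streaming" executor; the option name below
    # is assumed from the old validation set, not shown in this diff.
    result = q.collect(
        engine=pl.GPUEngine(
            executor="streaming",
            executor_options={"max_rows_per_partition": 1_000_000},
        )
    )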
@@ -249,8 +251,12 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     nt
         NodeTraverser
 
+    duration_since_start
+        Time since the user started executing the query (or None if no
+        profiling should occur).
+
     config
-        GPUEngine configuration object
+        GPUEngine object. Configuration is available as ``engine.config``.
 
     Raises
     ------
@@ -263,16 +269,27 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     -----
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
-    device = config.device
+    if duration_since_start is None:
+        timer = None
+    else:
+        start = time.monotonic_ns()
+        timer = Timer(start - duration_since_start)
+
     memory_resource = config.memory_resource
-    raise_on_fail = config.config.get("raise_on_fail", False)
-    executor = config.config.get("executor", None)
-    validate_config_options(config.config)
 
     with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
         translator = Translator(nt, config)
         ir = translator.translate_ir()
         ir_translation_errors = translator.errors
+    if timer is not None:
+        timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
+
+    if (
+        memory_resource is None
+        and translator.config_options.executor.name == "streaming"
+        and translator.config_options.executor.scheduler == "distributed"
+    ):  # pragma: no cover; Requires distributed cluster
+        memory_resource = rmm.mr.get_current_device_resource()
     if len(ir_translation_errors):
         # TODO: Display these errors in user-friendly way.
         # tracked in https://github.com/rapidsai/cudf/issues/17051
@@ -287,15 +304,15 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
         exception = NotImplementedError(error_message, unique_errors)
         if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
             warnings.warn(error_message, PerformanceWarning, stacklevel=2)
-        if raise_on_fail:
+        if translator.config_options.raise_on_fail:
             raise exception
     else:
         nt.set_udf(
             partial(
                 _callback,
                 ir,
-                device=device,
                 memory_resource=memory_resource,
-                executor=executor,
+                config_options=translator.config_options,
+                timer=timer,
             )
         )
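The new `duration_since_start` argument and `Timer` plumbing let the callback return a `(DataFrame, timings)` pair, where each timing is a `(start, end, name)` triple. A sketch of what this enables, assuming this polars version routes `LazyFrame.profile` through `execute_with_cudf` with a non-None `duration_since_start`:

    import polars as pl

    q = pl.scan_parquet("data.parquet").select(pl.col("x") * 2)

    # Hypothetical profiling call: GPU phases recorded by the Timer, such as
    # "gpu-ir-translation", appear in the returned timings alongside the
    # usual polars nodes.
    result, timings = q.profile(engine=pl.GPUEngine())
    print(timings)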
cudf_polars/containers/column.py CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """A column, with some properties."""
@@ -19,6 +19,7 @@ from pylibcudf.strings.convert.convert_integers import (
 )
 from pylibcudf.traits import is_floating_point
 
+from cudf_polars.utils import conversion
 from cudf_polars.utils.dtypes import is_order_preserving_cast
 
 if TYPE_CHECKING:
@@ -26,6 +27,8 @@ if TYPE_CHECKING:
 
     import polars as pl
 
+    from cudf_polars.typing import ColumnHeader, ColumnOptions, Slice
+
 __all__: list[str] = ["Column"]
 
 
@@ -51,10 +54,69 @@ class Column:
         name: str | None = None,
     ):
         self.obj = column
-        self.is_scalar = self.obj.size() == 1
+        self.is_scalar = self.size == 1
         self.name = name
         self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
 
+    @classmethod
+    def deserialize(
+        cls, header: ColumnHeader, frames: tuple[memoryview, plc.gpumemoryview]
+    ) -> Self:
+        """
+        Create a Column from a serialized representation returned by `.serialize()`.
+
+        Parameters
+        ----------
+        header
+            The (unpickled) metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames (a memoryview and a gpumemoryview).
+
+        Returns
+        -------
+        Column
+            The deserialized Column.
+        """
+        packed_metadata, packed_gpu_data = frames
+        (plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
+            packed_metadata, packed_gpu_data
+        ).columns()
+        return cls(plc_column, **header["column_kwargs"])
+
+    def serialize(
+        self,
+    ) -> tuple[ColumnHeader, tuple[memoryview, plc.gpumemoryview]]:
+        """
+        Serialize the Column into header and frames.
+
+        Follows the Dask serialization scheme with a picklable header (dict) and
+        a tuple of frames (in this case a contiguous host and device buffer).
+
+        To enable dask support, dask serializers must be registered
+
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
+
+        Returns
+        -------
+        header
+            A dict containing any picklable metadata required to reconstruct the object.
+        frames
+            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
+        """
+        packed = plc.contiguous_split.pack(plc.Table([self.obj]))
+        column_kwargs: ColumnOptions = {
+            "is_sorted": self.is_sorted,
+            "order": self.order,
+            "null_order": self.null_order,
+            "name": self.name,
+        }
+        header: ColumnHeader = {
+            "column_kwargs": column_kwargs,
+            "frame_count": 2,
+        }
+        return header, packed.release()
+
     @functools.cached_property
     def obj_scalar(self) -> plc.Scalar:
         """
@@ -70,9 +132,7 @@ class Column:
         If the column is not length-1.
         """
         if not self.is_scalar:
-            raise ValueError(
-                f"Cannot convert a column of length {self.obj.size()} to scalar"
-            )
+            raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
         return plc.copying.get_element(self.obj, 0)
 
     def rename(self, name: str | None, /) -> Self:
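A round-trip sketch of the Column serialization added above; building the device column via `plc.interop.from_arrow` is an assumption here (any pylibcudf column constructor works equally well):

    import pyarrow as pa
    import pylibcudf as plc

    from cudf_polars.containers import Column

    col = Column(plc.interop.from_arrow(pa.array([1, 2, 3])), name="a")
    # Picklable dict plus a (memoryview, gpumemoryview) pair of frames.
    header, frames = col.serialize()
    restored = Column.deserialize(header, frames)
    assert restored.name == "a" and restored.size == col.size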
@@ -117,6 +177,44 @@ class Column:
             null_order=like.null_order,
         )
 
+    def check_sorted(
+        self,
+        *,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> bool:
+        """
+        Check if the column is sorted.
+
+        Parameters
+        ----------
+        order
+            The requested sort order.
+        null_order
+            Where nulls sort to.
+
+        Returns
+        -------
+        True if the column is sorted, false otherwise.
+
+        Notes
+        -----
+        If the sortedness flag is not set, this launches a kernel to
+        check sortedness.
+        """
+        if self.obj.size() <= 1 or self.obj.size() == self.obj.null_count():
+            return True
+        if self.is_sorted == plc.types.Sorted.YES:
+            return self.order == order and (
+                self.obj.null_count() == 0 or self.null_order == null_order
+            )
+        if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
+            self.sorted = plc.types.Sorted.YES
+            self.order = order
+            self.null_order = null_order
+            return True
+        return False
+
     def astype(self, dtype: plc.DataType) -> Column:
         """
         Cast the column to as the requested dtype.
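Usage sketch for `check_sorted` (column construction assumed as in the earlier example): the fast path consults the cached sortedness flags, and a successful kernel check writes them back so repeated checks do no device work:

    import pyarrow as pa
    import pylibcudf as plc

    from cudf_polars.containers import Column

    col = Column(plc.interop.from_arrow(pa.array([1, 2, 3])), name="a")

    # May launch an is_sorted kernel on first call; subsequent calls hit
    # the flags cached on success.
    assert col.check_sorted(
        order=plc.types.Order.ASCENDING,
        null_order=plc.types.NullOrder.BEFORE,
    )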
@@ -242,7 +340,7 @@ class Column:
         -------
         Self with metadata set.
         """
-        if self.obj.size() <= 1:
+        if self.size <= 1:
             is_sorted = plc.types.Sorted.YES
         self.is_sorted = is_sorted
         self.order = order
@@ -268,7 +366,7 @@ class Column:
     def mask_nans(self) -> Self:
         """Return a shallow copy of self with nans masked out."""
         if plc.traits.is_floating_point(self.obj.type()):
-            old_count = self.obj.null_count()
+            old_count = self.null_count
             mask, new_count = plc.transform.nans_to_nulls(self.obj)
             result = type(self)(self.obj.with_mask(mask, new_count))
             if old_count == new_count:
@@ -288,3 +386,36 @@ class Column:
             )
         ).as_py()
         return 0
+
+    @property
+    def size(self) -> int:
+        """Return the size of the column."""
+        return self.obj.size()
+
+    @property
+    def null_count(self) -> int:
+        """Return the number of Null values in the column."""
+        return self.obj.null_count()
+
+    def slice(self, zlice: Slice | None) -> Self:
+        """
+        Slice a column.
+
+        Parameters
+        ----------
+        zlice
+            optional, tuple of start and length, negative values of start
+            treated as for python indexing. If not provided, returns self.
+
+        Returns
+        -------
+        New column (if zlice is not None) otherwise self (if it is)
+        """
+        if zlice is None:
+            return self
+        (table,) = plc.copying.slice(
+            plc.Table([self.obj]),
+            conversion.from_polars_slice(zlice, num_rows=self.size),
+        )
+        (column,) = table.columns()
+        return type(self)(column, name=self.name).sorted_like(self)
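`Column.slice` mirrors the polars `(start, length)` convention, with negative starts wrapping as in Python indexing and `None` as a no-op. A brief sketch (device column construction assumed as before):

    import pyarrow as pa
    import pylibcudf as plc

    from cudf_polars.containers import Column

    col = Column(plc.interop.from_arrow(pa.array([10, 20, 30, 40])), name="a")

    head = col.slice((0, 2))    # rows 0 and 1
    tail = col.slice((-2, 2))   # negative start wraps: rows 2 and 3
    assert col.slice(None) is col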
cudf_polars/containers/dataframe.py CHANGED
@@ -1,27 +1,26 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """A dataframe, with some properties."""
 
 from __future__ import annotations
 
-import pickle
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, cast
-
-import pyarrow as pa
+from typing import TYPE_CHECKING, cast
 
 import polars as pl
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.utils import dtypes
+from cudf_polars.utils import conversion
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence, Set
 
-    from typing_extensions import Self
+    from typing_extensions import Any, Self
+
+    from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
 
 
 __all__: list[str] = ["DataFrame"]
@@ -107,17 +106,12 @@ class DataFrame:
         -------
         New dataframe representing the input.
         """
-        table = df.to_arrow()
-        schema = table.schema
-        for i, field in enumerate(schema):
-            schema = schema.set(
-                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
-            )
-        # No-op if the schema is unchanged.
-        d_table = plc.interop.from_arrow(table.cast(schema))
+        plc_table = plc.Table(df)
         return cls(
-            Column(column).copy_metadata(h_col)
-            for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True)
+            Column(d_col, name=name).copy_metadata(h_col)
+            for d_col, h_col, name in zip(
+                plc_table.columns(), df.iter_columns(), df.columns, strict=True
+            )
         )
 
     @classmethod
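The rewritten `from_polars` drops the pyarrow round-trip and, as the diff shows, builds the device table directly from the polars DataFrame; the exact copy mechanism behind `plc.Table(df)` is not visible here. A sketch:

    import polars as pl
    import pylibcudf as plc

    df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    table = plc.Table(df)  # device table straight from the polars frame
    assert table.num_columns() == df.width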
@@ -150,7 +144,7 @@ class DataFrame:
 
     @classmethod
     def deserialize(
-        cls, header: Mapping[str, Any], frames: tuple[memoryview, plc.gpumemoryview]
+        cls, header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
     ) -> Self:
         """
         Create a DataFrame from a serialized representation returned by `.serialize()`.
@@ -178,7 +172,7 @@ class DataFrame:
 
     def serialize(
         self,
-    ) -> tuple[Mapping[str, Any], tuple[memoryview, plc.gpumemoryview]]:
+    ) -> tuple[DataFrameHeader, tuple[memoryview, plc.gpumemoryview]]:
         """
         Serialize the table into header and frames.
 
@@ -187,20 +181,20 @@ class DataFrame:
 
         To enable dask support, dask serializers must be registered
 
-        >>> from cudf_polars.experimental.dask_serialize import register
-        >>> register()
+            >>> from cudf_polars.experimental.dask_serialize import register
+            >>> register()
 
         Returns
         -------
         header
             A dict containing any picklable metadata required to reconstruct the object.
         frames
-            Two-tuple of frames suitable for passing to `unpack_from_memoryviews`
+            Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
         """
         packed = plc.contiguous_split.pack(self.table)
 
         # Keyword arguments for `Column.__init__`.
-        columns_kwargs = [
+        columns_kwargs: list[ColumnOptions] = [
             {
                 "is_sorted": col.is_sorted,
                 "order": col.order,
@@ -209,10 +203,8 @@ class DataFrame:
             }
             for col in self.columns
         ]
-        header = {
+        header: DataFrameHeader = {
             "columns_kwargs": columns_kwargs,
-            # Dask Distributed uses "type-serialized" to dispatch deserialization
-            "type-serialized": pickle.dumps(type(self)),
             "frame_count": 2,
         }
         return header, packed.release()
@@ -247,7 +239,9 @@ class DataFrame:
             for c, other in zip(self.columns, like.columns, strict=True)
         )
 
-    def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self:
+    def with_columns(
+        self, columns: Iterable[Column], *, replace_only: bool = False
+    ) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -276,7 +270,7 @@ class DataFrame:
         """Drop columns by name."""
         return type(self)(column for column in self.columns if column.name not in names)
 
-    def select(self, names: Sequence[str]) -> Self:
+    def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
         """Select columns by name returning DataFrame."""
         try:
             return type(self)(self.column_map[name] for name in names)
@@ -296,7 +290,7 @@ class DataFrame:
         table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
         return type(self).from_table(table, self.column_names).sorted_like(self)
 
-    def slice(self, zlice: tuple[int, int] | None) -> Self:
+    def slice(self, zlice: Slice | None) -> Self:
         """
         Slice a dataframe.
 
@@ -312,14 +306,7 @@ class DataFrame:
         """
         if zlice is None:
             return self
-        start, length = zlice
-        if start < 0:
-            start += self.num_rows
-        # Polars implementation wraps negative start by num_rows, then
-        # adds length to start to get the end, then clamps both to
-        # [0, num_rows)
-        end = start + length
-        start = max(min(start, self.num_rows), 0)
-        end = max(min(end, self.num_rows), 0)
-        (table,) = plc.copying.slice(self.table, [start, end])
+        (table,) = plc.copying.slice(
+            self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
+        )
         return type(self).from_table(table, self.column_names).sorted_like(self)
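The deleted inline arithmetic documents the slice semantics that the new `cudf_polars.utils.conversion.from_polars_slice` helper centralizes (both `Column.slice` and `DataFrame.slice` now call it). A plausible sketch of the helper, reconstructed from the removed lines; the real body lives in the added conversion.py, not shown inline in this diff:

    def from_polars_slice(zlice: tuple[int, int], *, num_rows: int) -> list[int]:
        # Polars encodes a slice as (start, length): a negative start wraps
        # once by num_rows, then start and start + length are clamped to
        # [0, num_rows), matching the code removed above.
        start, length = zlice
        if start < 0:
            start += num_rows
        end = start + length
        start = max(min(start, num_rows), 0)
        end = max(min(end, num_rows), 0)
        return [start, end]  # bounds for plc.copying.slice

    # With num_rows=10: (-3, 2) -> [7, 9]; (8, 5) -> [8, 10].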
cudf_polars/dsl/expr.py CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -30,6 +30,7 @@ from cudf_polars.dsl.expressions.datetime import TemporalFunction
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
 from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow
 from cudf_polars.dsl.expressions.selection import Filter, Gather
+from cudf_polars.dsl.expressions.slicing import Slice
 from cudf_polars.dsl.expressions.sorting import Sort, SortBy
 from cudf_polars.dsl.expressions.string import StringFunction
 from cudf_polars.dsl.expressions.ternary import Ternary
@@ -53,6 +54,7 @@ __all__ = [
     "LiteralColumn",
     "NamedExpr",
     "RollingWindow",
+    "Slice",
     "Sort",
     "SortBy",
     "StringFunction",