cudf-polars-cu12 25.2.1-py3-none-any.whl → 25.4.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (38)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +85 -53
  3. cudf_polars/containers/column.py +100 -7
  4. cudf_polars/containers/dataframe.py +16 -24
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +3 -3
  7. cudf_polars/dsl/expressions/binaryop.py +2 -2
  8. cudf_polars/dsl/expressions/boolean.py +4 -4
  9. cudf_polars/dsl/expressions/datetime.py +39 -1
  10. cudf_polars/dsl/expressions/literal.py +3 -9
  11. cudf_polars/dsl/expressions/selection.py +2 -2
  12. cudf_polars/dsl/expressions/slicing.py +53 -0
  13. cudf_polars/dsl/expressions/sorting.py +1 -1
  14. cudf_polars/dsl/expressions/string.py +4 -4
  15. cudf_polars/dsl/expressions/unary.py +3 -2
  16. cudf_polars/dsl/ir.py +222 -93
  17. cudf_polars/dsl/nodebase.py +8 -1
  18. cudf_polars/dsl/translate.py +66 -38
  19. cudf_polars/experimental/base.py +18 -12
  20. cudf_polars/experimental/dask_serialize.py +22 -8
  21. cudf_polars/experimental/groupby.py +346 -0
  22. cudf_polars/experimental/io.py +13 -11
  23. cudf_polars/experimental/join.py +318 -0
  24. cudf_polars/experimental/parallel.py +57 -6
  25. cudf_polars/experimental/shuffle.py +194 -0
  26. cudf_polars/testing/plugin.py +23 -34
  27. cudf_polars/typing/__init__.py +33 -2
  28. cudf_polars/utils/config.py +138 -0
  29. cudf_polars/utils/conversion.py +40 -0
  30. cudf_polars/utils/dtypes.py +14 -4
  31. cudf_polars/utils/timer.py +39 -0
  32. cudf_polars/utils/versions.py +4 -3
  33. {cudf_polars_cu12-25.2.1.dist-info → cudf_polars_cu12-25.4.0.dist-info}/METADATA +8 -7
  34. cudf_polars_cu12-25.4.0.dist-info/RECORD +55 -0
  35. {cudf_polars_cu12-25.2.1.dist-info → cudf_polars_cu12-25.4.0.dist-info}/WHEEL +1 -1
  36. cudf_polars_cu12-25.2.1.dist-info/RECORD +0 -48
  37. {cudf_polars_cu12-25.2.1.dist-info → cudf_polars_cu12-25.4.0.dist-info/licenses}/LICENSE +0 -0
  38. {cudf_polars_cu12-25.2.1.dist-info → cudf_polars_cu12-25.4.0.dist-info}/top_level.txt +0 -0
cudf_polars/VERSION CHANGED
@@ -1 +1 @@
- 25.02.01
+ 25.04.00
cudf_polars/callback.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0

  """Callback for the polars collect function to execute on device."""
@@ -7,9 +7,10 @@ from __future__ import annotations

  import contextlib
  import os
+ import time
  import warnings
  from functools import cache, partial
- from typing import TYPE_CHECKING, Literal
+ from typing import TYPE_CHECKING, Literal, overload

  import nvtx

@@ -20,6 +21,8 @@ import rmm
  from rmm._cuda import gpu

  from cudf_polars.dsl.translate import Translator
+ from cudf_polars.utils.timer import Timer
+ from cudf_polars.utils.versions import POLARS_VERSION_LT_125

  if TYPE_CHECKING:
      from collections.abc import Generator
@@ -173,19 +176,53 @@ def set_device(device: int | None) -> Generator[int, None, None]:
          gpu.setDevice(previous)


+ @overload
  def _callback(
      ir: IR,
      with_columns: list[str] | None,
      pyarrow_predicate: str | None,
      n_rows: int | None,
+     should_time: Literal[False],
      *,
      device: int | None,
      memory_resource: int | None,
      executor: Literal["pylibcudf", "dask-experimental"] | None,
- ) -> pl.DataFrame:
+     timer: Timer | None,
+ ) -> pl.DataFrame: ...
+
+
+ @overload
+ def _callback(
+     ir: IR,
+     with_columns: list[str] | None,
+     pyarrow_predicate: str | None,
+     n_rows: int | None,
+     should_time: Literal[True],
+     *,
+     device: int | None,
+     memory_resource: int | None,
+     executor: Literal["pylibcudf", "dask-experimental"] | None,
+     timer: Timer | None,
+ ) -> tuple[pl.DataFrame, list[tuple[int, int, str]]]: ...
+
+
+ def _callback(
+     ir: IR,
+     with_columns: list[str] | None,
+     pyarrow_predicate: str | None,
+     n_rows: int | None,
+     should_time: bool,  # noqa: FBT001
+     *,
+     device: int | None,
+     memory_resource: int | None,
+     executor: Literal["pylibcudf", "dask-experimental"] | None,
+     timer: Timer | None,
+ ):
      assert with_columns is None
      assert pyarrow_predicate is None
      assert n_rows is None
+     if timer is not None:
+         assert should_time
      with (
          nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
          # Device must be set before memory resource is obtained.
@@ -193,7 +230,11 @@ def _callback(
          set_memory_resource(memory_resource),
      ):
          if executor is None or executor == "pylibcudf":
-             return ir.evaluate(cache={}).to_polars()
+             df = ir.evaluate(cache={}, timer=timer).to_polars()
+             if timer is None:
+                 return df
+             else:
+                 return df, timer.timings
          elif executor == "dask-experimental":
              from cudf_polars.experimental.parallel import evaluate_dask

@@ -202,45 +243,9 @@
          raise ValueError(f"Unknown executor '{executor}'")


- def validate_config_options(config: dict) -> None:
-     """
-     Validate the configuration options for the GPU engine.
-
-     Parameters
-     ----------
-     config
-         Configuration options to validate.
-
-     Raises
-     ------
-     ValueError
-         If the configuration contains unsupported options.
-     """
-     if unsupported := (
-         config.keys()
-         - {"raise_on_fail", "parquet_options", "executor", "executor_options"}
-     ):
-         raise ValueError(
-             f"Engine configuration contains unsupported settings: {unsupported}"
-         )
-     assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
-         config.get("parquet_options", {})
-     )
-
-     # Validate executor_options
-     executor = config.get("executor", "pylibcudf")
-     if executor == "dask-experimental":
-         unsupported = config.get("executor_options", {}).keys() - {
-             "max_rows_per_partition",
-             "parquet_blocksize",
-         }
-     else:
-         unsupported = config.get("executor_options", {}).keys()
-     if unsupported:
-         raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
-
-
- def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
+ def execute_with_cudf(
+     nt: NodeTraverser, duration_since_start: int | None, *, config: GPUEngine
+ ) -> None:
      """
      A post optimization callback that attempts to execute the plan with cudf.

@@ -249,6 +254,10 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
      nt
          NodeTraverser

+     duration_since_start
+         Time since the user started executing the query (or None if no
+         profiling should occur).
+
      config
          GPUEngine configuration object

@@ -263,16 +272,21 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
      -----
      The NodeTraverser is mutated if the libcudf executor can handle the plan.
      """
+     if duration_since_start is None:
+         timer = None
+     else:
+         start = time.monotonic_ns()
+         timer = Timer(start - duration_since_start)
      device = config.device
      memory_resource = config.memory_resource
      raise_on_fail = config.config.get("raise_on_fail", False)
      executor = config.config.get("executor", None)
-     validate_config_options(config.config)
-
      with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
          translator = Translator(nt, config)
          ir = translator.translate_ir()
          ir_translation_errors = translator.errors
+         if timer is not None:
+             timer.store(start, time.monotonic_ns(), "gpu-ir-translation")
          if len(ir_translation_errors):
              # TODO: Display these errors in user-friendly way.
              # tracked in https://github.com/rapidsai/cudf/issues/17051
@@ -290,12 +304,30 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
          if raise_on_fail:
              raise exception
      else:
-         nt.set_udf(
-             partial(
-                 _callback,
-                 ir,
-                 device=device,
-                 memory_resource=memory_resource,
-                 executor=executor,
+         if POLARS_VERSION_LT_125:  # pragma: no cover
+             nt.set_udf(
+                 partial(
+                     _callback,
+                     ir,
+                     should_time=False,
+                     device=device,
+                     memory_resource=memory_resource,
+                     executor=executor,
+                     timer=None,
+                 )
              )
-         )
+         else:
+             nt.set_udf(
+                 partial(
+                     _callback,
+                     ir,
+                     device=device,
+                     memory_resource=memory_resource,
+                     executor=executor,
+                     timer=timer,
+                 )
+             )
+
+
+ if POLARS_VERSION_LT_125:  # pragma: no cover
+     execute_with_cudf = partial(execute_with_cudf, duration_since_start=None)
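The profiling support added above reduces to a simple scheme: `_callback` gains `should_time`/`timer` parameters (with `@overload`s giving the two return types), and the timer accumulates `(start, end, name)` spans in nanoseconds relative to when the user started the query. The new `cudf_polars/utils/timer.py` is not shown in this diff, so the sketch below is an assumption reconstructed from the calls that are visible (`Timer(start - duration_since_start)`, `timer.store(...)`, `timer.timings`):

```python
import time


class Timer:
    """Hypothetical sketch of the Timer used above; the shipped class may differ."""

    def __init__(self, start: int) -> None:
        # Origin in monotonic nanoseconds; execute_with_cudf constructs it
        # as time.monotonic_ns() - duration_since_start.
        self.start = start
        self.timings: list[tuple[int, int, str]] = []

    def store(self, start: int, end: int, name: str) -> None:
        # Record one named span relative to the query-start origin.
        self.timings.append((start - self.start, end - self.start, name))


timer = Timer(time.monotonic_ns())
t0 = time.monotonic_ns()
sum(range(1_000_000))  # stand-in for real work
timer.store(t0, time.monotonic_ns(), "gpu-ir-translation")
print(timer.timings)  # [(rel_start_ns, rel_end_ns, "gpu-ir-translation")]
```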
cudf_polars/containers/column.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0

  """A column, with some properties."""
@@ -19,6 +19,7 @@ from pylibcudf.strings.convert.convert_integers import (
  )
  from pylibcudf.traits import is_floating_point

+ from cudf_polars.utils import conversion
  from cudf_polars.utils.dtypes import is_order_preserving_cast

  if TYPE_CHECKING:
@@ -26,6 +27,8 @@ if TYPE_CHECKING:

      import polars as pl

+     from cudf_polars.typing import ColumnHeader, ColumnOptions, Slice
+
  __all__: list[str] = ["Column"]


@@ -51,10 +54,69 @@ class Column:
          name: str | None = None,
      ):
          self.obj = column
-         self.is_scalar = self.obj.size() == 1
+         self.is_scalar = self.size == 1
          self.name = name
          self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)

+     @classmethod
+     def deserialize(
+         cls, header: ColumnHeader, frames: tuple[memoryview, plc.gpumemoryview]
+     ) -> Self:
+         """
+         Create a Column from a serialized representation returned by `.serialize()`.
+
+         Parameters
+         ----------
+         header
+             The (unpickled) metadata required to reconstruct the object.
+         frames
+             Two-tuple of frames (a memoryview and a gpumemoryview).
+
+         Returns
+         -------
+         Column
+             The deserialized Column.
+         """
+         packed_metadata, packed_gpu_data = frames
+         (plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
+             packed_metadata, packed_gpu_data
+         ).columns()
+         return cls(plc_column, **header["column_kwargs"])
+
+     def serialize(
+         self,
+     ) -> tuple[ColumnHeader, tuple[memoryview, plc.gpumemoryview]]:
+         """
+         Serialize the Column into header and frames.
+
+         Follows the Dask serialization scheme with a picklable header (dict) and
+         a tuple of frames (in this case a contiguous host and device buffer).
+
+         To enable dask support, dask serializers must be registered
+
+         >>> from cudf_polars.experimental.dask_serialize import register
+         >>> register()
+
+         Returns
+         -------
+         header
+             A dict containing any picklable metadata required to reconstruct the object.
+         frames
+             Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
+         """
+         packed = plc.contiguous_split.pack(plc.Table([self.obj]))
+         column_kwargs: ColumnOptions = {
+             "is_sorted": self.is_sorted,
+             "order": self.order,
+             "null_order": self.null_order,
+             "name": self.name,
+         }
+         header: ColumnHeader = {
+             "column_kwargs": column_kwargs,
+             "frame_count": 2,
+         }
+         return header, packed.release()
+
      @functools.cached_property
      def obj_scalar(self) -> plc.Scalar:
          """
@@ -70,9 +132,7 @@ class Column:
          If the column is not length-1.
          """
          if not self.is_scalar:
-             raise ValueError(
-                 f"Cannot convert a column of length {self.obj.size()} to scalar"
-             )
+             raise ValueError(f"Cannot convert a column of length {self.size} to scalar")
          return plc.copying.get_element(self.obj, 0)

      def rename(self, name: str | None, /) -> Self:
@@ -242,7 +302,7 @@ class Column:
          -------
          Self with metadata set.
          """
-         if self.obj.size() <= 1:
+         if self.size <= 1:
              is_sorted = plc.types.Sorted.YES
          self.is_sorted = is_sorted
          self.order = order
@@ -268,7 +328,7 @@ class Column:
      def mask_nans(self) -> Self:
          """Return a shallow copy of self with nans masked out."""
          if plc.traits.is_floating_point(self.obj.type()):
-             old_count = self.obj.null_count()
+             old_count = self.null_count
              mask, new_count = plc.transform.nans_to_nulls(self.obj)
              result = type(self)(self.obj.with_mask(mask, new_count))
              if old_count == new_count:
@@ -288,3 +348,36 @@ class Column:
              )
          ).as_py()
          return 0
+
+     @property
+     def size(self) -> int:
+         """Return the size of the column."""
+         return self.obj.size()
+
+     @property
+     def null_count(self) -> int:
+         """Return the number of Null values in the column."""
+         return self.obj.null_count()
+
+     def slice(self, zlice: Slice | None) -> Self:
+         """
+         Slice a column.
+
+         Parameters
+         ----------
+         zlice
+             optional, tuple of start and length, negative values of start
+             treated as for python indexing. If not provided, returns self.
+
+         Returns
+         -------
+         New column (if zlice is not None) otherwise self (if it is)
+         """
+         if zlice is None:
+             return self
+         (table,) = plc.copying.slice(
+             plc.Table([self.obj]),
+             conversion.from_polars_slice(zlice, num_rows=self.size),
+         )
+         (column,) = table.columns()
+         return type(self)(column, name=self.name).sorted_like(self)
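The new serialize/deserialize pair round-trips a Column, including its name and sortedness metadata. A minimal sketch of how the pieces compose (assumes a CUDA-capable environment with pylibcudf installed):

```python
import pyarrow as pa
import pylibcudf as plc

from cudf_polars.containers import Column

# Wrap a small device column.
col = Column(plc.interop.from_arrow(pa.array([1, 2, 3])), name="a")

# header is picklable metadata; frames is a (memoryview, gpumemoryview)
# pair holding the packed host metadata and device buffer.
header, frames = col.serialize()

# Rebuild an equivalent Column, metadata included.
roundtrip = Column.deserialize(header, frames)
assert roundtrip.name == "a" and roundtrip.size == 3
```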
cudf_polars/containers/dataframe.py CHANGED
@@ -1,13 +1,12 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0

  """A dataframe, with some properties."""

  from __future__ import annotations

- import pickle
  from functools import cached_property
- from typing import TYPE_CHECKING, Any, cast
+ from typing import TYPE_CHECKING, cast

  import pyarrow as pa

@@ -16,13 +15,15 @@ import polars as pl
  import pylibcudf as plc

  from cudf_polars.containers import Column
- from cudf_polars.utils import dtypes
+ from cudf_polars.utils import conversion, dtypes

  if TYPE_CHECKING:
      from collections.abc import Iterable, Mapping, Sequence, Set

      from typing_extensions import Self

+     from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
+

  __all__: list[str] = ["DataFrame"]

@@ -150,7 +151,7 @@ class DataFrame:

      @classmethod
      def deserialize(
-         cls, header: Mapping[str, Any], frames: tuple[memoryview, plc.gpumemoryview]
+         cls, header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
      ) -> Self:
          """
          Create a DataFrame from a serialized representation returned by `.serialize()`.
@@ -178,7 +179,7 @@ class DataFrame:

      def serialize(
          self,
-     ) -> tuple[Mapping[str, Any], tuple[memoryview, plc.gpumemoryview]]:
+     ) -> tuple[DataFrameHeader, tuple[memoryview, plc.gpumemoryview]]:
          """
          Serialize the table into header and frames.

@@ -187,20 +188,20 @@ class DataFrame:

          To enable dask support, dask serializers must be registered

-             >>> from cudf_polars.experimental.dask_serialize import register
-             >>> register()
+         >>> from cudf_polars.experimental.dask_serialize import register
+         >>> register()

          Returns
          -------
          header
              A dict containing any picklable metadata required to reconstruct the object.
          frames
-             Two-tuple of frames suitable for passing to `unpack_from_memoryviews`
+             Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
          """
          packed = plc.contiguous_split.pack(self.table)

          # Keyword arguments for `Column.__init__`.
-         columns_kwargs = [
+         columns_kwargs: list[ColumnOptions] = [
              {
                  "is_sorted": col.is_sorted,
                  "order": col.order,
@@ -209,10 +210,8 @@ class DataFrame:
              }
              for col in self.columns
          ]
-         header = {
+         header: DataFrameHeader = {
              "columns_kwargs": columns_kwargs,
-             # Dask Distributed uses "type-serialized" to dispatch deserialization
-             "type-serialized": pickle.dumps(type(self)),
              "frame_count": 2,
          }
          return header, packed.release()
@@ -296,7 +295,7 @@ class DataFrame:
          table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
          return type(self).from_table(table, self.column_names).sorted_like(self)

-     def slice(self, zlice: tuple[int, int] | None) -> Self:
+     def slice(self, zlice: Slice | None) -> Self:
          """
          Slice a dataframe.

@@ -312,14 +311,7 @@ class DataFrame:
          """
          if zlice is None:
              return self
-         start, length = zlice
-         if start < 0:
-             start += self.num_rows
-         # Polars implementation wraps negative start by num_rows, then
-         # adds length to start to get the end, then clamps both to
-         # [0, num_rows)
-         end = start + length
-         start = max(min(start, self.num_rows), 0)
-         end = max(min(end, self.num_rows), 0)
-         (table,) = plc.copying.slice(self.table, [start, end])
+         (table,) = plc.copying.slice(
+             self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
+         )
          return type(self).from_table(table, self.column_names).sorted_like(self)
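Both `Column.slice` and `DataFrame.slice` now delegate start/length normalization to `conversion.from_polars_slice` from the new `cudf_polars/utils/conversion.py`, which this diff does not show. A sketch of its presumed behaviour, reconstructed from the inline code removed above:

```python
def from_polars_slice(zlice: tuple[int, int], *, num_rows: int) -> list[int]:
    # Presumed semantics, based on the removed inline logic in
    # DataFrame.slice; the shipped helper may differ in details.
    start, length = zlice
    # Polars wraps a negative start by num_rows, then clamps the
    # [start, start + length) window to [0, num_rows).
    if start < 0:
        start += num_rows
    end = start + length
    start = max(min(start, num_rows), 0)
    end = max(min(end, num_rows), 0)
    return [start, end]


assert from_polars_slice((-2, 5), num_rows=4) == [2, 4]
assert from_polars_slice((1, 2), num_rows=4) == [1, 3]
```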
cudf_polars/dsl/expr.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  # TODO: remove need for this
  # ruff: noqa: D101
@@ -30,6 +30,7 @@ from cudf_polars.dsl.expressions.datetime import TemporalFunction
  from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
  from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow
  from cudf_polars.dsl.expressions.selection import Filter, Gather
+ from cudf_polars.dsl.expressions.slicing import Slice
  from cudf_polars.dsl.expressions.sorting import Sort, SortBy
  from cudf_polars.dsl.expressions.string import StringFunction
  from cudf_polars.dsl.expressions.ternary import Ternary
@@ -53,6 +54,7 @@ __all__ = [
      "LiteralColumn",
      "NamedExpr",
      "RollingWindow",
+     "Slice",
      "Sort",
      "SortBy",
      "StringFunction",
cudf_polars/dsl/expressions/aggregation.py CHANGED
@@ -172,7 +172,7 @@ class Agg(Expr):
              plc.Column.from_scalar(
                  plc.interop.from_arrow(
                      pa.scalar(
-                         column.obj.size() - column.obj.null_count(),
+                         column.size - column.null_count,
                          type=plc.interop.to_arrow(self.dtype),
                      ),
                  ),
@@ -181,7 +181,7 @@
          )

      def _sum(self, column: Column) -> Column:
-         if column.obj.size() == 0:
+         if column.size == 0 or column.null_count == column.size:
              return Column(
                  plc.Column.from_scalar(
                      plc.interop.from_arrow(
@@ -224,7 +224,7 @@
          return Column(plc.copying.slice(column.obj, [0, 1])[0])

      def _last(self, column: Column) -> Column:
-         n = column.obj.size()
+         n = column.size
          return Column(plc.copying.slice(column.obj, [n - 1, n])[0])

      def do_evaluate(
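The `_sum` change brings the GPU engine in line with polars semantics: nulls are ignored in sums, so an empty or all-null column sums to zero rather than null. On the CPU engine:

```python
import polars as pl

# Both an empty and an all-null column sum to 0, never null; the extra
# `null_count == size` branch above makes the GPU engine agree.
assert pl.Series("a", [], dtype=pl.Int64).sum() == 0
assert pl.Series("a", [None, None], dtype=pl.Int64).sum() == 0
```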
cudf_polars/dsl/expressions/binaryop.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  # TODO: remove need for this
  # ruff: noqa: D101
@@ -98,7 +98,7 @@ class BinOp(Expr):
          )
          lop = left.obj
          rop = right.obj
-         if left.obj.size() != right.obj.size():
+         if left.size != right.size:
              if left.is_scalar:
                  lop = left.obj_scalar
              elif right.is_scalar:
cudf_polars/dsl/expressions/boolean.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  # TODO: remove need for this
  # ruff: noqa: D101
@@ -191,7 +191,7 @@ class BooleanFunction(Expr):
          is_any = self.name is BooleanFunction.Name.Any
          agg = plc.aggregation.any() if is_any else plc.aggregation.all()
          result = plc.reduce.reduce(column.obj, agg, self.dtype)
-         if not ignore_nulls and column.obj.null_count() > 0:
+         if not ignore_nulls and column.null_count > 0:
              # Truth tables
              #     Any         All
              #   | F U T     | F U T
@@ -218,14 +218,14 @@
              (column,) = columns
              return Column(
                  plc.unary.is_nan(column.obj).with_mask(
-                     column.obj.null_mask(), column.obj.null_count()
+                     column.obj.null_mask(), column.null_count
                  )
              )
          elif self.name is BooleanFunction.Name.IsNotNan:
              (column,) = columns
              return Column(
                  plc.unary.is_not_nan(column.obj).with_mask(
-                     column.obj.null_mask(), column.obj.null_count()
+                     column.obj.null_mask(), column.null_count
                  )
              )
          elif self.name is BooleanFunction.Name.IsFirstDistinct:
cudf_polars/dsl/expressions/datetime.py CHANGED
@@ -104,6 +104,14 @@ class TemporalFunction(Expr):
          Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
      }

+     _valid_ops: ClassVar[set[Name]] = {
+         *_COMPONENT_MAP.keys(),
+         Name.IsLeapYear,
+         Name.OrdinalDay,
+         Name.MonthStart,
+         Name.MonthEnd,
+     }
+
      def __init__(
          self,
          dtype: plc.DataType,
@@ -116,7 +124,7 @@
          self.name = name
          self.children = children
          self.is_pointwise = True
-         if self.name not in self._COMPONENT_MAP:
+         if self.name not in self._valid_ops:
              raise NotImplementedError(f"Temporal function {self.name}")

      def do_evaluate(
@@ -132,6 +140,36 @@
              for child in self.children
          ]
          (column,) = columns
+         if self.name is TemporalFunction.Name.MonthStart:
+             ends = plc.datetime.last_day_of_month(column.obj)
+             days_to_subtract = plc.datetime.days_in_month(column.obj)
+             # must subtract 1 to avoid rolling over to the previous month
+             days_to_subtract = plc.binaryop.binary_operation(
+                 days_to_subtract,
+                 plc.interop.from_arrow(pa.scalar(1, type=pa.int32())),
+                 plc.binaryop.BinaryOperator.SUB,
+                 plc.DataType(plc.TypeId.DURATION_DAYS),
+             )
+             result = plc.binaryop.binary_operation(
+                 ends,
+                 days_to_subtract,
+                 plc.binaryop.BinaryOperator.SUB,
+                 column.obj.type(),
+             )
+
+             return Column(result)
+         if self.name is TemporalFunction.Name.MonthEnd:
+             return Column(
+                 plc.unary.cast(
+                     plc.datetime.last_day_of_month(column.obj), column.obj.type()
+                 )
+             )
+         if self.name is TemporalFunction.Name.IsLeapYear:
+             return Column(
+                 plc.datetime.is_leap_year(column.obj),
+             )
+         if self.name is TemporalFunction.Name.OrdinalDay:
+             return Column(plc.datetime.day_of_year(column.obj))
          if self.name is TemporalFunction.Name.Microsecond:
              millis = plc.datetime.extract_datetime_component(
                  column.obj, plc.datetime.DatetimeComponent.MILLISECOND
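With `_valid_ops` extended, the GPU engine now handles `dt.month_start`, `dt.month_end`, `dt.is_leap_year`, and `dt.ordinal_day`. The month-start arithmetic works because `last_day_of_month(d) - (days_in_month(d) - 1)` always lands on the 1st: for 2024-02-14 the last day is 2024-02-29, and subtracting 29 - 1 = 28 days gives 2024-02-01. A small end-to-end check (assumes cudf-polars 25.04 is installed alongside polars):

```python
from datetime import date

import polars as pl

q = pl.LazyFrame({"d": [date(2024, 2, 14), date(2023, 7, 1)]}).select(
    start=pl.col("d").dt.month_start(),
    end=pl.col("d").dt.month_end(),
    leap=pl.col("d").dt.is_leap_year(),
    doy=pl.col("d").dt.ordinal_day(),
)
# Prior to 25.04 these expressions were not supported by the GPU engine
# and fell back to the CPU engine.
print(q.collect(engine="gpu"))
```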
cudf_polars/dsl/expressions/literal.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  # TODO: remove need for this
  # ruff: noqa: D101
@@ -8,21 +8,16 @@ from __future__ import annotations

  from typing import TYPE_CHECKING, Any

- import pyarrow as pa
-
  import pylibcudf as plc

  from cudf_polars.containers import Column
  from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
- from cudf_polars.utils import dtypes

  if TYPE_CHECKING:
      from collections.abc import Hashable, Mapping

      import pyarrow as pa

-     import polars as pl
-
      from cudf_polars.containers import DataFrame

  __all__ = ["Literal", "LiteralColumn"]
@@ -61,10 +56,9 @@ class LiteralColumn(Expr):
      _non_child = ("dtype", "value")
      value: pa.Array[Any]

-     def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
+     def __init__(self, dtype: plc.DataType, value: pa.Array) -> None:
          self.dtype = dtype
-         data = value.to_arrow()
-         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+         self.value = value
          self.children = ()
          self.is_pointwise = True
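`LiteralColumn.__init__` now expects an already-converted `pa.Array`, so the `pl.Series` → Arrow conversion (and the large-list downcast) presumably moves to the caller in `translate.py`, which this diff does not show. A sketch of that caller-side conversion, using the removed lines as a guide and assuming `downcast_arrow_lists` is still exported from `cudf_polars.utils.dtypes`:

```python
import polars as pl

from cudf_polars.utils import dtypes

series = pl.Series([[1, 2], [3]])
data = series.to_arrow()
# Downcast pyarrow large_list (64-bit offsets) to list (32-bit offsets),
# the layout libcudf expects, then hand the array to LiteralColumn as-is.
value = data.cast(dtypes.downcast_arrow_lists(data.type))
# LiteralColumn(dtype, value) then stores the Arrow array unchanged.
```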