cudf_polars_cu13-25.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/dsl/expressions/boolean.py
@@ -0,0 +1,326 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+ # TODO: Document BooleanFunction to remove noqa
+ # ruff: noqa: D101
+ """Boolean DSL nodes."""
+
+ from __future__ import annotations
+
+ from enum import IntEnum, auto
+ from functools import partial, reduce
+ from typing import TYPE_CHECKING, Any, ClassVar
+
+ import pylibcudf as plc
+
+ from cudf_polars.containers import Column, DataType
+ from cudf_polars.dsl.expressions.base import (
+     ExecutionContext,
+     Expr,
+ )
+
+ if TYPE_CHECKING:
+     from typing_extensions import Self
+
+     import polars.type_aliases as pl_types
+     from polars.polars import _expr_nodes as pl_expr
+
+     from cudf_polars.containers import DataFrame
+
+ __all__ = ["BooleanFunction"]
+
+
+ class BooleanFunction(Expr):
+     class Name(IntEnum):
+         """Internal and picklable representation of polars' `BooleanFunction`."""
+
+         All = auto()
+         AllHorizontal = auto()
+         Any = auto()
+         AnyHorizontal = auto()
+         IsBetween = auto()
+         IsClose = auto()
+         IsDuplicated = auto()
+         IsFinite = auto()
+         IsFirstDistinct = auto()
+         IsIn = auto()
+         IsInfinite = auto()
+         IsLastDistinct = auto()
+         IsNan = auto()
+         IsNotNan = auto()
+         IsNotNull = auto()
+         IsNull = auto()
+         IsUnique = auto()
+         Not = auto()
+
+         @classmethod
+         def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self:
+             """Convert from polars' `BooleanFunction`."""
+             try:
+                 function, name = str(obj).split(".", maxsplit=1)
+             except ValueError:
+                 # Failed to unpack string
+                 function = None
+             if function != "BooleanFunction":
+                 raise ValueError("BooleanFunction required")
+             return getattr(cls, name)
+
+     __slots__ = ("name", "options")
+     _non_child = ("dtype", "name", "options")
+
+     def __init__(
+         self,
+         dtype: DataType,
+         name: BooleanFunction.Name,
+         options: tuple[Any, ...],
+         *children: Expr,
+     ) -> None:
+         self.dtype = dtype
+         self.options = options
+         self.name = name
+         self.children = children
+         self.is_pointwise = self.name not in (
+             BooleanFunction.Name.All,
+             BooleanFunction.Name.Any,
+             BooleanFunction.Name.IsDuplicated,
+             BooleanFunction.Name.IsFirstDistinct,
+             BooleanFunction.Name.IsLastDistinct,
+             BooleanFunction.Name.IsUnique,
+         )
+         if self.name in {
+             BooleanFunction.Name.IsClose,
+         }:
+             raise NotImplementedError(
+                 f"Boolean function {self.name}"
+             )  # pragma: no cover
+
+     @staticmethod
+     def _distinct(
+         column: Column,
+         dtype: DataType,
+         *,
+         keep: plc.stream_compaction.DuplicateKeepOption,
+         source_value: plc.Scalar,
+         target_value: plc.Scalar,
+     ) -> Column:
+         table = plc.Table([column.obj])
+         indices = plc.stream_compaction.distinct_indices(
+             table,
+             keep,
+             # TODO: polars doesn't expose options for these
+             plc.types.NullEquality.EQUAL,
+             plc.types.NanEquality.ALL_EQUAL,
+         )
+         return Column(
+             plc.copying.scatter(
+                 [source_value],
+                 indices,
+                 plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
+             ).columns()[0],
+             dtype=dtype,
+         )
+
+     _BETWEEN_OPS: ClassVar[
+         dict[
+             pl_types.ClosedInterval,
+             tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator],
+         ]
+     ] = {
+         "none": (
+             plc.binaryop.BinaryOperator.GREATER,
+             plc.binaryop.BinaryOperator.LESS,
+         ),
+         "left": (
+             plc.binaryop.BinaryOperator.GREATER_EQUAL,
+             plc.binaryop.BinaryOperator.LESS,
+         ),
+         "right": (
+             plc.binaryop.BinaryOperator.GREATER,
+             plc.binaryop.BinaryOperator.LESS_EQUAL,
+         ),
+         "both": (
+             plc.binaryop.BinaryOperator.GREATER_EQUAL,
+             plc.binaryop.BinaryOperator.LESS_EQUAL,
+         ),
+     }
+
+     def do_evaluate(
+         self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+     ) -> Column:
+         """Evaluate this expression given a dataframe for context."""
+         if self.name in (
+             BooleanFunction.Name.IsFinite,
+             BooleanFunction.Name.IsInfinite,
+         ):
+             # Avoid evaluating the child if the dtype tells us it's unnecessary.
+             (child,) = self.children
+             needles = child.evaluate(df, context=context)
+             is_float = needles.obj.type().id() in (
+                 plc.TypeId.FLOAT32,
+                 plc.TypeId.FLOAT64,
+             )
+             is_finite = self.name is BooleanFunction.Name.IsFinite
+             if not is_float:
+                 base = plc.Column.from_scalar(
+                     plc.Scalar.from_py(py_val=is_finite), needles.size
+                 )
+                 out = base.with_mask(needles.obj.null_mask(), needles.null_count)
+                 return Column(out, dtype=self.dtype)
+             to_search = [-float("inf"), float("inf")]
+             if is_finite:
+                 # NaN is neither finite nor infinite
+                 to_search.append(float("nan"))
+             haystack = plc.Column.from_iterable_of_py(
+                 to_search,
+                 dtype=needles.obj.type(),
+             )
+             result = plc.search.contains(haystack, needles.obj)
+             if is_finite:
+                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+             return Column(
+                 result.with_mask(needles.obj.null_mask(), needles.null_count),
+                 dtype=self.dtype,
+             )
+         columns = [child.evaluate(df, context=context) for child in self.children]
+         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
+         # False
+         if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All):
+             (ignore_nulls,) = self.options
+             (column,) = columns
+             is_any = self.name is BooleanFunction.Name.Any
+             agg = plc.aggregation.any() if is_any else plc.aggregation.all()
+             result = plc.reduce.reduce(column.obj, agg, self.dtype.plc)
+             if not ignore_nulls and column.null_count > 0:
+                 # Truth tables
+                 #     Any         All
+                 #   | F U T     | F U T
+                 # --+------   --+------
+                 # F | F U T   F | F F F
+                 # U | U U T   U | F U U
+                 # T | T T T   T | F U T
+                 #
+                 # If the input null count was non-zero, we must
+                 # post-process the result to insert the correct value.
+                 h_result = result.to_py()
+                 if (is_any and not h_result) or (not is_any and h_result):
+                     #          Any                      All
+                     #   False || Null => Null    True && Null => Null
+                     return Column(
+                         plc.Column.all_null_like(column.obj, 1), dtype=self.dtype
+                     )
+             return Column(plc.Column.from_scalar(result, 1), dtype=self.dtype)
+         if self.name is BooleanFunction.Name.IsNull:
+             (column,) = columns
+             return Column(plc.unary.is_null(column.obj), dtype=self.dtype)
+         elif self.name is BooleanFunction.Name.IsNotNull:
+             (column,) = columns
+             return Column(plc.unary.is_valid(column.obj), dtype=self.dtype)
+         elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan):
+             (column,) = columns
+             is_float = column.obj.type().id() in (
+                 plc.TypeId.FLOAT32,
+                 plc.TypeId.FLOAT64,
+             )
+             if is_float:
+                 op = (
+                     plc.unary.is_nan
+                     if self.name is BooleanFunction.Name.IsNan
+                     else plc.unary.is_not_nan
+                 )
+                 base = op(column.obj)
+             else:
+                 base = plc.Column.from_scalar(
+                     plc.Scalar.from_py(
+                         py_val=self.name is not BooleanFunction.Name.IsNan
+                     ),
+                     column.size,
+                 )
+             out = base.with_mask(column.obj.null_mask(), column.null_count)
+             return Column(out, dtype=self.dtype)
+         elif self.name is BooleanFunction.Name.IsFirstDistinct:
+             (column,) = columns
+             return self._distinct(
+                 column,
+                 dtype=self.dtype,
+                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+                 source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
+                 target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+             )
+         elif self.name is BooleanFunction.Name.IsLastDistinct:
+             (column,) = columns
+             return self._distinct(
+                 column,
+                 dtype=self.dtype,
+                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+                 source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
+                 target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+             )
+         elif self.name is BooleanFunction.Name.IsUnique:
+             (column,) = columns
+             return self._distinct(
+                 column,
+                 dtype=self.dtype,
+                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+                 source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
+                 target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+             )
+         elif self.name is BooleanFunction.Name.IsDuplicated:
+             (column,) = columns
+             return self._distinct(
+                 column,
+                 dtype=self.dtype,
+                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+                 source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+                 target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
+             )
+         elif self.name is BooleanFunction.Name.AllHorizontal:
+             return Column(
+                 reduce(
+                     partial(
+                         plc.binaryop.binary_operation,
+                         op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND,
+                         output_type=self.dtype.plc,
+                     ),
+                     (c.obj for c in columns),
+                 ),
+                 dtype=self.dtype,
+             )
+         elif self.name is BooleanFunction.Name.AnyHorizontal:
+             return Column(
+                 reduce(
+                     partial(
+                         plc.binaryop.binary_operation,
+                         op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
+                         output_type=self.dtype.plc,
+                     ),
+                     (c.obj for c in columns),
+                 ),
+                 dtype=self.dtype,
+             )
+         elif self.name is BooleanFunction.Name.IsIn:
+             needles, haystack = columns
+             if haystack.obj.type().id() == plc.TypeId.LIST:
+                 # Unwrap values from the list column
+                 # the type: ignore is safe because we know that the type ID is LIST,
+                 # which always has an inner attribute.
+                 haystack = Column(
+                     haystack.obj.children()[1],
+                     dtype=DataType(haystack.dtype.polars.inner),  # type: ignore[attr-defined]
+                 ).astype(needles.dtype)
+             if haystack.size:
+                 return Column(
+                     plc.search.contains(haystack.obj, needles.obj), dtype=self.dtype
+                 )
+             return Column(
+                 plc.Column.from_scalar(plc.Scalar.from_py(py_val=False), needles.size),
+                 dtype=self.dtype,
+             )
+         elif self.name is BooleanFunction.Name.Not:
+             (column,) = columns
+             return Column(
+                 plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT),
+                 dtype=self.dtype,
+             )
+         else:
+             raise NotImplementedError(
+                 f"BooleanFunction {self.name}"
+             )  # pragma: no cover; handled by init raising
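
The Any/All branch above reproduces polars' Kleene semantics for reductions over nullable booleans when ignore_nulls is False. A minimal CPU-side sketch of those semantics, assuming a recent polars release (no GPU or cudf_polars required), shows the two truth-table corner cases the post-processing step handles:

import polars as pl

s = pl.Series([False, None])
print(s.any())                    # False: nulls ignored by default
print(s.any(ignore_nulls=False))  # None, i.e. False || Null => Null

t = pl.Series([True, None])
print(t.all())                    # True: nulls ignored by default
print(t.all(ignore_nulls=False))  # None, i.e. True && Null => Null
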
cudf_polars/dsl/expressions/datetime.py
@@ -0,0 +1,271 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+ # TODO: Document TemporalFunction to remove noqa
+ # ruff: noqa: D101
+ """DSL nodes for datetime operations."""
+
+ from __future__ import annotations
+
+ from enum import IntEnum, auto
+ from typing import TYPE_CHECKING, Any, ClassVar
+
+ import pylibcudf as plc
+
+ from cudf_polars.containers import Column
+ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+
+ if TYPE_CHECKING:
+     from typing_extensions import Self
+
+     from polars.polars import _expr_nodes as pl_expr
+
+     from cudf_polars.containers import DataFrame, DataType
+
+ __all__ = ["TemporalFunction"]
+
+
+ class TemporalFunction(Expr):
+     class Name(IntEnum):
+         """Internal and picklable representation of polars' `TemporalFunction`."""
+
+         BaseUtcOffset = auto()
+         CastTimeUnit = auto()
+         Century = auto()
+         Combine = auto()
+         ConvertTimeZone = auto()
+         DSTOffset = auto()
+         Date = auto()
+         Datetime = auto()
+         DatetimeFunction = auto()
+         Day = auto()
+         DaysInMonth = auto()
+         Duration = auto()
+         Hour = auto()
+         IsLeapYear = auto()
+         IsoYear = auto()
+         Microsecond = auto()
+         Millennium = auto()
+         Millisecond = auto()
+         Minute = auto()
+         Month = auto()
+         MonthEnd = auto()
+         MonthStart = auto()
+         Nanosecond = auto()
+         OffsetBy = auto()
+         OrdinalDay = auto()
+         Quarter = auto()
+         Replace = auto()
+         ReplaceTimeZone = auto()
+         Round = auto()
+         Second = auto()
+         Time = auto()
+         TimeStamp = auto()
+         ToString = auto()
+         TotalDays = auto()
+         TotalHours = auto()
+         TotalMicroseconds = auto()
+         TotalMilliseconds = auto()
+         TotalMinutes = auto()
+         TotalNanoseconds = auto()
+         TotalSeconds = auto()
+         Truncate = auto()
+         Week = auto()
+         WeekDay = auto()
+         WithTimeUnit = auto()
+         Year = auto()
+
+         @classmethod
+         def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self:
+             """Convert from polars' `TemporalFunction`."""
+             try:
+                 function, name = str(obj).split(".", maxsplit=1)
+             except ValueError:
+                 # Failed to unpack string
+                 function = None
+             if function != "TemporalFunction":
+                 raise ValueError("TemporalFunction required")
+             return getattr(cls, name)
+
+     __slots__ = ("name", "options")
+     _non_child = ("dtype", "name", "options")
+     _COMPONENT_MAP: ClassVar[dict[Name, plc.datetime.DatetimeComponent]] = {
+         Name.Year: plc.datetime.DatetimeComponent.YEAR,
+         Name.Month: plc.datetime.DatetimeComponent.MONTH,
+         Name.Day: plc.datetime.DatetimeComponent.DAY,
+         Name.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY,
+         Name.Hour: plc.datetime.DatetimeComponent.HOUR,
+         Name.Minute: plc.datetime.DatetimeComponent.MINUTE,
+         Name.Second: plc.datetime.DatetimeComponent.SECOND,
+         Name.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND,
+         Name.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND,
+         Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
+     }
+
+     _valid_ops: ClassVar[set[Name]] = {
+         *_COMPONENT_MAP.keys(),
+         Name.IsLeapYear,
+         Name.OrdinalDay,
+         Name.ToString,
+         Name.Week,
+         Name.IsoYear,
+         Name.MonthStart,
+         Name.MonthEnd,
+         Name.CastTimeUnit,
+     }
+
+     def __init__(
+         self,
+         dtype: DataType,
+         name: TemporalFunction.Name,
+         options: tuple[Any, ...],
+         *children: Expr,
+     ) -> None:
+         self.dtype = dtype
+         self.options = options
+         self.name = name
+         self.children = children
+         self.is_pointwise = True
+         if self.name not in self._valid_ops:
+             raise NotImplementedError(f"Temporal function {self.name}")
+
+         if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
+             self.children[0].dtype.plc
+         ):
+             raise NotImplementedError("ToString is not supported on duration types")
+
+     def do_evaluate(
+         self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+     ) -> Column:
+         """Evaluate this expression given a dataframe for context."""
+         columns = [child.evaluate(df, context=context) for child in self.children]
+         (column,) = columns
+         if self.name is TemporalFunction.Name.CastTimeUnit:
+             return Column(plc.unary.cast(column.obj, self.dtype.plc), dtype=self.dtype)
+         if self.name == TemporalFunction.Name.ToString:
+             return Column(
+                 plc.strings.convert.convert_datetime.from_timestamps(
+                     column.obj,
+                     self.options[0],
+                     plc.Column.from_iterable_of_py([], dtype=self.dtype.plc),
+                 ),
+                 dtype=self.dtype,
+             )
+         if self.name is TemporalFunction.Name.Week:
+             result = plc.strings.convert.convert_integers.to_integers(
+                 plc.strings.convert.convert_datetime.from_timestamps(
+                     column.obj,
+                     format="%V",
+                     input_strings_names=plc.Column.from_iterable_of_py(
+                         [], dtype=plc.DataType(plc.TypeId.STRING)
+                     ),
+                 ),
+                 self.dtype.plc,
+             )
+             return Column(result, dtype=self.dtype)
+         if self.name is TemporalFunction.Name.IsoYear:
+             result = plc.strings.convert.convert_integers.to_integers(
+                 plc.strings.convert.convert_datetime.from_timestamps(
+                     column.obj,
+                     format="%G",
+                     input_strings_names=plc.Column.from_iterable_of_py(
+                         [], dtype=plc.DataType(plc.TypeId.STRING)
+                     ),
+                 ),
+                 self.dtype.plc,
+             )
+             return Column(result, dtype=self.dtype)
+         if self.name is TemporalFunction.Name.MonthStart:
+             ends = plc.datetime.last_day_of_month(column.obj)
+             days_to_subtract = plc.datetime.days_in_month(column.obj)
+             # must subtract 1 to avoid rolling over to the previous month
+             days_to_subtract = plc.binaryop.binary_operation(
+                 days_to_subtract,
+                 plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
+                 plc.binaryop.BinaryOperator.SUB,
+                 plc.DataType(plc.TypeId.DURATION_DAYS),
+             )
+             result = plc.binaryop.binary_operation(
+                 ends,
+                 days_to_subtract,
+                 plc.binaryop.BinaryOperator.SUB,
+                 self.dtype.plc,
+             )
+
+             return Column(result, dtype=self.dtype)
+         if self.name is TemporalFunction.Name.MonthEnd:
+             return Column(
+                 plc.unary.cast(
+                     plc.datetime.last_day_of_month(column.obj), self.dtype.plc
+                 ),
+                 dtype=self.dtype,
+             )
+         if self.name is TemporalFunction.Name.IsLeapYear:
+             return Column(
+                 plc.datetime.is_leap_year(column.obj),
+                 dtype=self.dtype,
+             )
+         if self.name is TemporalFunction.Name.OrdinalDay:
+             return Column(plc.datetime.day_of_year(column.obj), dtype=self.dtype)
+         if self.name is TemporalFunction.Name.Microsecond:
+             millis = plc.datetime.extract_datetime_component(
+                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
+             )
+             micros = plc.datetime.extract_datetime_component(
+                 column.obj, plc.datetime.DatetimeComponent.MICROSECOND
+             )
+             millis_as_micros = plc.binaryop.binary_operation(
+                 millis,
+                 plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
+                 plc.binaryop.BinaryOperator.MUL,
+                 self.dtype.plc,
+             )
+             total_micros = plc.binaryop.binary_operation(
+                 micros,
+                 millis_as_micros,
+                 plc.binaryop.BinaryOperator.ADD,
+                 self.dtype.plc,
+             )
+             return Column(total_micros, dtype=self.dtype)
+         elif self.name is TemporalFunction.Name.Nanosecond:
+             millis = plc.datetime.extract_datetime_component(
+                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
+             )
+             micros = plc.datetime.extract_datetime_component(
+                 column.obj, plc.datetime.DatetimeComponent.MICROSECOND
+             )
+             nanos = plc.datetime.extract_datetime_component(
+                 column.obj, plc.datetime.DatetimeComponent.NANOSECOND
+             )
+             millis_as_nanos = plc.binaryop.binary_operation(
+                 millis,
+                 plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
+                 plc.binaryop.BinaryOperator.MUL,
+                 self.dtype.plc,
+             )
+             micros_as_nanos = plc.binaryop.binary_operation(
+                 micros,
+                 plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
+                 plc.binaryop.BinaryOperator.MUL,
+                 self.dtype.plc,
+             )
+             total_nanos = plc.binaryop.binary_operation(
+                 nanos,
+                 millis_as_nanos,
+                 plc.binaryop.BinaryOperator.ADD,
+                 self.dtype.plc,
+             )
+             total_nanos = plc.binaryop.binary_operation(
+                 total_nanos,
+                 micros_as_nanos,
+                 plc.binaryop.BinaryOperator.ADD,
+                 self.dtype.plc,
+             )
+             return Column(total_nanos, dtype=self.dtype)
+
+         return Column(
+             plc.datetime.extract_datetime_component(
+                 column.obj,
+                 self._COMPONENT_MAP[self.name],
+             ),
+             dtype=self.dtype,
+         )
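
The Microsecond and Nanosecond branches above exist because libcudf extracts each sub-second field separately (milliseconds 0-999, microseconds 0-999, and so on), while polars reports the whole fractional second in the requested unit. A small host-side sketch of the arithmetic being reproduced, using only the Python standard library:

from datetime import datetime

ts = datetime(2024, 1, 1, 12, 0, 0, 123_456)  # fractional second = .123456
millis, micros = 123, 456  # per-field values, as a libcudf-style extraction yields
# total microseconds = milliseconds * 1_000 + microseconds
assert millis * 1_000 + micros == ts.microsecond == 123_456
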
cudf_polars/dsl/expressions/literal.py
@@ -0,0 +1,97 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+ # TODO: remove need for this
+ # ruff: noqa: D101
+ """Literal DSL nodes."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any, NoReturn
+
+ import polars as pl
+
+ import pylibcudf as plc
+
+ from cudf_polars.containers import Column, DataType
+ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+
+ if TYPE_CHECKING:
+     from collections.abc import Hashable
+
+     from cudf_polars.containers import DataFrame
+
+ __all__ = ["Literal", "LiteralColumn"]
+
+
+ class Literal(Expr):
+     __slots__ = ("value",)
+     _non_child = ("dtype", "value")
+     value: Any  # Python scalar
+
+     def __init__(self, dtype: DataType, value: Any) -> None:
+         if value is None and dtype.id() == plc.TypeId.EMPTY:
+             # TypeId.EMPTY not supported by libcudf
+             # cuDF Python also maps EMPTY to INT8
+             dtype = DataType(pl.datatypes.Int8())
+         self.dtype = dtype
+         self.value = value
+         self.children = ()
+         self.is_pointwise = True
+
+     def do_evaluate(
+         self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+     ) -> Column:
+         """Evaluate this expression given a dataframe for context."""
+         return Column(
+             plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype.plc), 1),
+             dtype=self.dtype,
+         )
+
+     @property
+     def agg_request(self) -> NoReturn:  # noqa: D102
+         raise NotImplementedError(
+             "Not expecting to require agg request of literal"
+         )  # pragma: no cover
+
+     def astype(self, dtype: DataType) -> Literal:
+         """Cast self to dtype."""
+         if self.value is None:
+             return Literal(dtype, self.value)
+         else:
+             # Use polars to cast instead of pylibcudf
+             # since these are just Python scalars
+             casted = pl.Series(values=[self.value], dtype=self.dtype.polars).cast(
+                 dtype.polars
+             )[0]
+             return Literal(dtype, casted)
+
+
+ class LiteralColumn(Expr):
+     __slots__ = ("value",)
+     _non_child = ("dtype", "value")
+     value: pl.Series
+
+     def __init__(self, dtype: DataType, value: pl.Series) -> None:
+         self.dtype = dtype
+         self.value = value
+         self.children = ()
+         self.is_pointwise = True
+
+     def get_hashable(self) -> Hashable:
+         """Compute a hash of the column."""
+         # This is stricter than necessary, but we only need this hash
+         # for identity in groupby replacements so it's OK. And this
+         # way we avoid doing potentially expensive compute.
+         return (type(self), self.dtype.plc, id(self.value))
+
+     def do_evaluate(
+         self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+     ) -> Column:
+         """Evaluate this expression given a dataframe for context."""
+         return Column(plc.Column.from_arrow(self.value), dtype=self.dtype)
+
+     @property
+     def agg_request(self) -> NoReturn:  # noqa: D102
+         raise NotImplementedError(
+             "Not expecting to require agg request of literal"
+         )  # pragma: no cover
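
Literal.astype above delegates scalar casts to polars rather than pylibcudf, since only a single Python value is involved. A minimal sketch of that wrap-cast-unwrap round trip, assuming polars is installed (the concrete dtypes here are illustrative):

import polars as pl

value = 2
# Same pattern as Literal.astype: wrap the scalar in a one-element Series,
# cast it, then pull the Python scalar back out.
casted = pl.Series(values=[value], dtype=pl.Int64).cast(pl.Float64)[0]
print(casted, type(casted))  # 2.0 <class 'float'>
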