cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,186 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Utilities for rolling window aggregations."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING
9
+
10
+ import polars as pl
11
+
12
+ import pylibcudf as plc
13
+
14
+ if TYPE_CHECKING:
15
+ from cudf_polars.typing import ClosedInterval, Duration
16
+
17
+
18
+ __all__ = [
19
+ "duration_to_int",
20
+ "duration_to_scalar",
21
+ "offsets_to_windows",
22
+ "range_window_bounds",
23
+ ]
24
+
25
+
26
def duration_to_int(
    dtype: plc.DataType,
    months: int,
    weeks: int,
    days: int,
    nanoseconds: int,
    parsed_int: bool,  # noqa: FBT001
    negative: bool,  # noqa: FBT001
) -> int:
    """
    Convert a polars duration value to an integer.

    Parameters
    ----------
    dtype
        The type of the column being added to.
    months
        Number of months
    weeks
        Number of weeks
    days
        Number of days
    nanoseconds
        Number of nanoseconds
    parsed_int
        Is this actually a representation of an integer, not a duration?
    negative
        Is this a negative duration?

    Returns
    -------
    int
        The total number of nanoseconds represented by this duration,
        or just an integer if `parsed_int` was true.

    Raises
    ------
    NotImplementedError
        For unsupported durations or datatypes.
    """
    # Calendar months have no fixed nanosecond length, so they are unsupported.
    if months != 0:
        raise NotImplementedError("Month durations in rolling windows")
    is_int64 = dtype.id() == plc.TypeId.INT64
    if parsed_int and (weeks != 0 or days != 0 or not is_int64):
        # A "parsed int" window must be a plain integer count over an INT64
        # column; any calendar component here is contradictory.
        raise NotImplementedError(
            "Invalid duration for parsed_int"
        )  # pragma: no cover; polars raises first
    elif not parsed_int and is_int64:
        # An integer column requires an integer window specification.
        raise pl.exceptions.InvalidOperationError("Duration must be a parsed integer")
    nanos_per_day = 24 * 60 * 60 * 10**9
    total = nanoseconds + nanos_per_day * (days + 7 * weeks)
    return -total if negative else total
76
+
77
+
78
def duration_to_scalar(dtype: plc.DataType, value: int) -> plc.Scalar:
    """
    Convert a raw polars duration value to a pylibcudf scalar.

    Parameters
    ----------
    dtype
        The type of the column being added to.
    value
        The raw value as an integer. If `dtype` represents a timestamp
        type, this should be in nanoseconds.

    Returns
    -------
    pylibcudf.Scalar
        With datatype matching the provided dtype.

    Raises
    ------
    NotImplementedError
        For unsupported durations or datatypes.
    """
    tid = dtype.id()
    if tid == plc.TypeId.INT64:
        # Integer windows are passed through unchanged.
        return plc.Scalar.from_py(value, dtype)
    elif tid == plc.TypeId.TIMESTAMP_NANOSECONDS:
        return plc.Scalar.from_py(value, plc.DataType(plc.TypeId.DURATION_NANOSECONDS))
    elif tid == plc.TypeId.TIMESTAMP_MICROSECONDS:
        # NOTE(review): `//` floors toward negative infinity, so negative
        # nanosecond values not divisible by the scale round away from zero —
        # confirm this matches polars' truncation semantics.
        return plc.Scalar.from_py(
            value // 1000, plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
        )
    elif tid == plc.TypeId.TIMESTAMP_MILLISECONDS:
        return plc.Scalar.from_py(
            value // 1_000_000, plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
        )
    else:
        raise NotImplementedError("Unsupported data type in rolling window offset")
115
+
116
+
117
def offsets_to_windows(
    dtype: plc.DataType,
    offset: Duration,
    period: Duration,
) -> tuple[plc.Scalar, plc.Scalar]:
    """
    Convert polars offset/period pair to preceding/following windows.

    Parameters
    ----------
    dtype
        Datatype of column defining windows
    offset
        Offset duration
    period
        Period of window

    Returns
    -------
    tuple of preceding and following windows as pylibcudf scalars.
    """
    offset_i = duration_to_int(dtype, *offset)
    period_i = duration_to_int(dtype, *period)
    # Polars uses current_row + offset, ..., current_row + offset + period
    # Libcudf uses current_row - preceding, ..., current_row + following
    preceding = duration_to_scalar(dtype, -offset_i)
    following = duration_to_scalar(dtype, offset_i + period_i)
    return preceding, following
145
+
146
+
147
def range_window_bounds(
    preceding: plc.Scalar, following: plc.Scalar, closed_window: ClosedInterval
) -> tuple[plc.rolling.RangeWindowType, plc.rolling.RangeWindowType]:
    """
    Convert preceding and following scalars to range window specs.

    Parameters
    ----------
    preceding
        The preceding window scalar.
    following
        The following window scalar.
    closed_window
        How the window interval endpoints are treated.

    Returns
    -------
    tuple
        Of preceding and following range window types.
    """
    # The left (preceding) endpoint is included for "both" and "left";
    # the right (following) endpoint is included for "both" and "right".
    # Anything else ("none") leaves both endpoints open.
    left_type = (
        plc.rolling.BoundedClosed
        if closed_window in ("both", "left")
        else plc.rolling.BoundedOpen
    )
    right_type = (
        plc.rolling.BoundedClosed
        if closed_window in ("both", "right")
        else plc.rolling.BoundedOpen
    )
    return left_type(preceding), right_type(following)
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
  """Multi-partition base classes."""
4
4
 
@@ -6,26 +6,29 @@ from __future__ import annotations
6
6
 
7
7
  from typing import TYPE_CHECKING
8
8
 
9
- from cudf_polars.dsl.ir import Union
10
-
11
9
  if TYPE_CHECKING:
12
- from collections.abc import Iterator, Sequence
10
+ from collections.abc import Iterator
13
11
 
14
- from cudf_polars.containers import DataFrame
12
+ from cudf_polars.dsl.expr import NamedExpr
15
13
  from cudf_polars.dsl.nodebase import Node
16
14
 
17
15
 
18
16
  class PartitionInfo:
19
- """
20
- Partitioning information.
21
-
22
- This class only tracks the partition count (for now).
23
- """
24
-
25
- __slots__ = ("count",)
26
-
27
- def __init__(self, count: int):
17
+ """Partitioning information."""
18
+
19
+ __slots__ = ("count", "partitioned_on")
20
+ count: int
21
+ """Partition count."""
22
+ partitioned_on: tuple[NamedExpr, ...]
23
+ """Columns the data is hash-partitioned on."""
24
+
25
+ def __init__(
26
+ self,
27
+ count: int,
28
+ partitioned_on: tuple[NamedExpr, ...] = (),
29
+ ):
28
30
  self.count = count
31
+ self.partitioned_on = partitioned_on
29
32
 
30
33
  def keys(self, node: Node) -> Iterator[tuple[str, int]]:
31
34
  """Return the partitioned keys for a given node."""
@@ -36,8 +39,3 @@ class PartitionInfo:
36
39
  def get_key_name(node: Node) -> str:
37
40
  """Generate the key name for a Node."""
38
41
  return f"{type(node).__name__.lower()}-{hash(node)}"
39
-
40
-
41
- def _concat(dfs: Sequence[DataFrame]) -> DataFrame:
42
- # Concatenate a sequence of DataFrames vertically
43
- return Union.do_evaluate(None, *dfs)
@@ -0,0 +1,4 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Experimental benchmarks."""