cudf_polars_cu13-25.10.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/experimental/groupby.py
@@ -0,0 +1,327 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
"""Parallel GroupBy Logic."""

from __future__ import annotations

import itertools
import math
from typing import TYPE_CHECKING

import polars as pl

import pylibcudf as plc

from cudf_polars.containers import DataType
from cudf_polars.dsl.expr import Agg, BinOp, Col, Len, NamedExpr
from cudf_polars.dsl.ir import GroupBy, Select, Slice
from cudf_polars.dsl.traversal import traversal
from cudf_polars.dsl.utils.naming import unique_names
from cudf_polars.experimental.base import PartitionInfo
from cudf_polars.experimental.dispatch import lower_ir_node
from cudf_polars.experimental.repartition import Repartition
from cudf_polars.experimental.shuffle import Shuffle
from cudf_polars.experimental.utils import _get_unique_fractions, _lower_ir_fallback

if TYPE_CHECKING:
    from collections.abc import Generator, MutableMapping

    from cudf_polars.dsl.expr import Expr
    from cudf_polars.dsl.ir import IR
    from cudf_polars.experimental.parallel import LowerIRTransformer


# Supported multi-partition aggregations
_GB_AGG_SUPPORTED = ("sum", "count", "mean", "min", "max", "n_unique")


def combine(
    *decompositions: tuple[NamedExpr, list[NamedExpr], list[NamedExpr], bool],
) -> tuple[list[NamedExpr], list[NamedExpr], list[NamedExpr], bool]:
    """
    Combine multiple groupby-aggregation decompositions.

    Parameters
    ----------
    decompositions
        Packed sequence of `decompose` results.

    Returns
    -------
    Unified groupby-aggregation decomposition.
    """
    if len(decompositions) == 0:
        return [], [], [], False
    selections, aggregations, reductions, need_preshuffles = zip(
        *decompositions, strict=True
    )
    assert all(isinstance(ne, NamedExpr) for ne in selections)
    return (
        list(selections),
        list(itertools.chain.from_iterable(aggregations)),
        list(itertools.chain.from_iterable(reductions)),
        any(need_preshuffles),
    )


def decompose(
    name: str, expr: Expr, *, names: Generator[str, None, None]
) -> tuple[NamedExpr, list[NamedExpr], list[NamedExpr], bool]:
    """
    Decompose a groupby-aggregation expression.

    Parameters
    ----------
    name
        Output schema name.
    expr
        The aggregation expression for a single column.
    names
        Generator of unique names for temporaries.

    Returns
    -------
    NamedExpr
        The expression selecting the *output* column or columns.
    list[NamedExpr]
        The initial aggregation expressions.
    list[NamedExpr]
        The reduction expressions.
    bool
        Whether we need to pre-shuffle on the group_by keys.
    """
    dtype = expr.dtype

    if isinstance(expr, Len):
        selection = NamedExpr(name, Col(dtype, name))
        aggregation = [NamedExpr(name, expr)]
        reduction = [NamedExpr(name, Agg(dtype, "sum", None, Col(dtype, name)))]
        return selection, aggregation, reduction, False
    if isinstance(expr, Agg):
        if expr.name in ("sum", "count", "min", "max", "n_unique"):
            if expr.name in ("sum", "count", "n_unique"):
                aggfunc = "sum"
            else:
                aggfunc = expr.name
            selection = NamedExpr(name, Col(dtype, name))
            aggregation = [NamedExpr(name, expr)]
            reduction = [NamedExpr(name, Agg(dtype, aggfunc, None, Col(dtype, name)))]
            return selection, aggregation, reduction, expr.name == "n_unique"
        elif expr.name == "mean":
            (child,) = expr.children
            (sum, count), aggregations, reductions, need_preshuffle = combine(
                decompose(
                    f"{next(names)}__mean_sum",
                    Agg(dtype, "sum", None, child),
                    names=names,
                ),
                decompose(
                    f"{next(names)}__mean_count",
                    Agg(DataType(pl.Int32()), "count", False, child),  # noqa: FBT003
                    names=names,
                ),
            )
            selection = NamedExpr(
                name,
                BinOp(dtype, plc.binaryop.BinaryOperator.DIV, sum.value, count.value),
            )
            return selection, aggregations, reductions, need_preshuffle
        else:
            raise NotImplementedError(
                "group_by does not support multiple partitions "
                f"for this aggregation type:\n{type(expr)}\n"
                f"Only {_GB_AGG_SUPPORTED} are supported."
            )
    else:  # pragma: no cover
        # Unsupported expression
        raise NotImplementedError(
            f"GroupBy does not support multiple partitions for this expression:\n{expr}"
        )


@lower_ir_node.register(GroupBy)
def _(
    ir: GroupBy, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    # Pull slice operations out of the GroupBy before lowering
    if ir.zlice is not None:
        offset, length = ir.zlice
        if length is None:  # pragma: no cover
            return _lower_ir_fallback(
                ir,
                rec,
                msg="This slice not supported for multiple partitions.",
            )
        new_join = GroupBy(
            ir.schema,
            ir.keys,
            ir.agg_requests,
            ir.maintain_order,
            None,
            *ir.children,
        )
        return rec(Slice(ir.schema, offset, length, new_join))

    # Extract child partitioning
    original_child = ir.children[0]
    child, partition_info = rec(ir.children[0])

    # Handle single-partition case
    if partition_info[child].count == 1:
        single_part_node = ir.reconstruct([child])
        partition_info[single_part_node] = partition_info[child]
        return single_part_node, partition_info

    # Check group-by keys
    if not all(
        expr.is_pointwise for expr in traversal([e.value for e in ir.keys])
    ):  # pragma: no cover
        return _lower_ir_fallback(
            ir,
            rec,
            msg="group_by does not support multiple partitions for non-pointwise keys.",
        )

    # Check if we are dealing with any high-cardinality columns
    post_aggregation_count = 1  # Default tree reduction
    groupby_key_columns = [ne.name for ne in ir.keys]
    shuffled = partition_info[child].partitioned_on == ir.keys

    config_options = rec.state["config_options"]
    assert config_options.executor.name == "streaming", (
        "'in-memory' executor not supported in 'lower_ir_node'"
    )

    child_count = partition_info[child].count
    if unique_fraction_dict := _get_unique_fractions(
        groupby_key_columns,
        config_options.executor.unique_fraction,
        row_count=rec.state["stats"].row_count.get(original_child),
        column_stats=rec.state["stats"].column_stats.get(original_child),
    ):
        # Use unique_fraction to determine output partitioning
        unique_fraction = max(unique_fraction_dict.values())
        post_aggregation_count = max(int(unique_fraction * child_count), 1)

    new_node: IR
    name_generator = unique_names(ir.schema.keys())
    # Decompose the aggregation requests into three distinct phases
    try:
        selection_exprs, piecewise_exprs, reduction_exprs, need_preshuffle = combine(
            *(
                decompose(agg.name, agg.value, names=name_generator)
                for agg in ir.agg_requests
            )
        )
    except NotImplementedError:
        if shuffled:  # pragma: no cover
            # Don't fallback if we are already shuffled.
            # We can just preserve the child's partitioning
            new_node = ir.reconstruct([child])
            partition_info[new_node] = partition_info[child]
            return new_node, partition_info
        return _lower_ir_fallback(
            ir, rec, msg="Failed to decompose groupby aggs for multiple partitions."
        )

    # Preshuffle ir.child if needed
    if need_preshuffle:
        child = Shuffle(
            child.schema,
            ir.keys,
            config_options.executor.shuffle_method,
            child,
        )
        partition_info[child] = PartitionInfo(
            count=child_count,
            partitioned_on=ir.keys,
        )
        shuffled = True

    # Partition-wise groupby operation
    pwise_schema = {k.name: k.value.dtype for k in ir.keys} | {
        k.name: k.value.dtype for k in piecewise_exprs
    }
    gb_pwise = GroupBy(
        pwise_schema,
        ir.keys,
        piecewise_exprs,
        ir.maintain_order,
        None,
        child,
    )
    child_count = partition_info[child].count
    partition_info[gb_pwise] = PartitionInfo(count=child_count)
    grouped_keys = tuple(NamedExpr(k.name, Col(k.value.dtype, k.name)) for k in ir.keys)

    # Reduction
    gb_inter: GroupBy | Repartition | Shuffle
    reduction_schema = {k.name: k.value.dtype for k in grouped_keys} | {
        k.name: k.value.dtype for k in reduction_exprs
    }
    if not shuffled and post_aggregation_count > 1:
        # Shuffle reduction
        if ir.maintain_order:  # pragma: no cover
            return _lower_ir_fallback(
                ir,
                rec,
                msg="maintain_order not supported for multiple output partitions.",
            )

        gb_inter = Shuffle(
            gb_pwise.schema,
            grouped_keys,
            config_options.executor.shuffle_method,
            gb_pwise,
        )
        partition_info[gb_inter] = PartitionInfo(count=post_aggregation_count)
    else:
        # N-ary tree reduction
        assert config_options.executor.name == "streaming", (
            "'in-memory' executor not supported in 'generate_ir_tasks'"
        )

        n_ary = config_options.executor.groupby_n_ary
        count = child_count
        gb_inter = gb_pwise
        while count > post_aggregation_count:
            gb_inter = Repartition(gb_inter.schema, gb_inter)
            count = max(math.ceil(count / n_ary), post_aggregation_count)
            partition_info[gb_inter] = PartitionInfo(count=count)
            if count > post_aggregation_count:
                gb_inter = GroupBy(
                    reduction_schema,
                    grouped_keys,
                    reduction_exprs,
                    ir.maintain_order,
                    None,
                    gb_inter,
                )
                partition_info[gb_inter] = PartitionInfo(count=count)

    # Final aggregation
    gb_reduce = GroupBy(
        reduction_schema,
        grouped_keys,
        reduction_exprs,
        ir.maintain_order,
        ir.zlice,
        gb_inter,
    )
    partition_info[gb_reduce] = PartitionInfo(count=post_aggregation_count)

    # Final Select phase
    new_node = Select(
        ir.schema,
        [
            *(NamedExpr(k.name, Col(k.value.dtype, k.name)) for k in grouped_keys),
            *selection_exprs,
        ],
        False,  # noqa: FBT003
        gb_reduce,
    )
    partition_info[new_node] = PartitionInfo(
        count=post_aggregation_count,
        partitioned_on=grouped_keys,
    )
    return new_node, partition_info
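
For orientation: `decompose`/`combine` split each aggregation request into three phases, so that, for example, a single `mean` becomes partition-wise `sum` and `count` aggregations, a `sum` reduction over the partials, and a final `sum / count` selection. Below is a minimal pure-Python sketch of that three-phase shape; the list-based "partitions" and all names in it are illustrative stand-ins, not part of the package.

# Illustrative sketch (not part of the package): the three-phase plan that
# decompose()/combine() build for a mean aggregation, using plain Python
# lists in place of partitioned GPU DataFrames.
from collections import defaultdict

partitions = [  # (key, value) rows split across two partitions
    [("a", 1.0), ("b", 2.0), ("a", 3.0)],
    [("b", 4.0), ("a", 5.0)],
]

# Phase 1 -- partition-wise aggregation (the "piecewise" exprs):
# per-key partial sum and count within each partition.
piecewise = []
for part in partitions:
    acc = defaultdict(lambda: [0.0, 0])
    for key, val in part:
        acc[key][0] += val
        acc[key][1] += 1
    piecewise.append(acc)

# Phase 2 -- reduction (the "reduction" exprs): sum the partials by key.
reduced = defaultdict(lambda: [0.0, 0])
for acc in piecewise:
    for key, (s, c) in acc.items():
        reduced[key][0] += s
        reduced[key][1] += c

# Phase 3 -- final selection (the "selection" exprs): mean = sum / count.
means = {key: s / c for key, (s, c) in reduced.items()}
print(means)  # {'a': 3.0, 'b': 3.0}

To exercise this lowering end to end, a query has to run under the streaming executor; in recent cudf-polars releases that is typically selected via `pl.GPUEngine(executor="streaming")` at `.collect()` time, though the exact options available in this release are documented in cudf-polars itself.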
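
When the input is not already shuffled on the keys and the estimated output fits in one partition, the lowering uses the n-ary tree reduction branch, alternating `Repartition` and intermediate `GroupBy` nodes until the target partition count is reached. A small sketch of just the partition-count arithmetic from that while-loop (the concrete numbers here are made up for illustration):

import math

# Example values: 32 input partitions reduced 4-at-a-time down to 1.
count, post_aggregation_count, n_ary = 32, 1, 4
levels = []
while count > post_aggregation_count:
    count = max(math.ceil(count / n_ary), post_aggregation_count)
    levels.append(count)
print(levels)  # [8, 2, 1] -- three reduction levels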