cudf_polars_cu13-25.10.0-py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
```diff
--- /dev/null
+++ cudf_polars/experimental/groupby.py
@@ -0,0 +1,327 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Parallel GroupBy Logic."""
+
+from __future__ import annotations
+
+import itertools
+import math
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+import pylibcudf as plc
+
+from cudf_polars.containers import DataType
+from cudf_polars.dsl.expr import Agg, BinOp, Col, Len, NamedExpr
+from cudf_polars.dsl.ir import GroupBy, Select, Slice
+from cudf_polars.dsl.traversal import traversal
+from cudf_polars.dsl.utils.naming import unique_names
+from cudf_polars.experimental.base import PartitionInfo
+from cudf_polars.experimental.dispatch import lower_ir_node
+from cudf_polars.experimental.repartition import Repartition
+from cudf_polars.experimental.shuffle import Shuffle
+from cudf_polars.experimental.utils import _get_unique_fractions, _lower_ir_fallback
+
+if TYPE_CHECKING:
+    from collections.abc import Generator, MutableMapping
+
+    from cudf_polars.dsl.expr import Expr
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.parallel import LowerIRTransformer
+
+
+# Supported multi-partition aggregations
+_GB_AGG_SUPPORTED = ("sum", "count", "mean", "min", "max", "n_unique")
+
+
+def combine(
+    *decompositions: tuple[NamedExpr, list[NamedExpr], list[NamedExpr], bool],
+) -> tuple[list[NamedExpr], list[NamedExpr], list[NamedExpr], bool]:
+    """
+    Combine multiple groupby-aggregation decompositions.
+
+    Parameters
+    ----------
+    decompositions
+        Packed sequence of `decompose` results.
+
+    Returns
+    -------
+    Unified groupby-aggregation decomposition.
+    """
+    if len(decompositions) == 0:
+        return [], [], [], False
+    selections, aggregations, reductions, need_preshuffles = zip(
+        *decompositions, strict=True
+    )
+    assert all(isinstance(ne, NamedExpr) for ne in selections)
+    return (
+        list(selections),
+        list(itertools.chain.from_iterable(aggregations)),
+        list(itertools.chain.from_iterable(reductions)),
+        any(need_preshuffles),
+    )
+
+
+def decompose(
+    name: str, expr: Expr, *, names: Generator[str, None, None]
+) -> tuple[NamedExpr, list[NamedExpr], list[NamedExpr], bool]:
+    """
+    Decompose a groupby-aggregation expression.
+
+    Parameters
+    ----------
+    name
+        Output schema name.
+    expr
+        The aggregation expression for a single column.
+    names
+        Generator of unique names for temporaries.
+
+    Returns
+    -------
+    NamedExpr
+        The expression selecting the *output* column or columns.
+    list[NamedExpr]
+        The initial aggregation expressions.
+    list[NamedExpr]
+        The reduction expressions.
+    bool
+        Whether we need to pre-shuffle on the group_by keys.
+    """
+    dtype = expr.dtype
+
+    if isinstance(expr, Len):
+        selection = NamedExpr(name, Col(dtype, name))
+        aggregation = [NamedExpr(name, expr)]
+        reduction = [NamedExpr(name, Agg(dtype, "sum", None, Col(dtype, name)))]
+        return selection, aggregation, reduction, False
+    if isinstance(expr, Agg):
+        if expr.name in ("sum", "count", "min", "max", "n_unique"):
+            if expr.name in ("sum", "count", "n_unique"):
+                aggfunc = "sum"
+            else:
+                aggfunc = expr.name
+            selection = NamedExpr(name, Col(dtype, name))
+            aggregation = [NamedExpr(name, expr)]
+            reduction = [NamedExpr(name, Agg(dtype, aggfunc, None, Col(dtype, name)))]
+            return selection, aggregation, reduction, expr.name == "n_unique"
+        elif expr.name == "mean":
+            (child,) = expr.children
+            (sum, count), aggregations, reductions, need_preshuffle = combine(
+                decompose(
+                    f"{next(names)}__mean_sum",
+                    Agg(dtype, "sum", None, child),
+                    names=names,
+                ),
+                decompose(
+                    f"{next(names)}__mean_count",
+                    Agg(DataType(pl.Int32()), "count", False, child),  # noqa: FBT003
+                    names=names,
+                ),
+            )
+            selection = NamedExpr(
+                name,
+                BinOp(dtype, plc.binaryop.BinaryOperator.DIV, sum.value, count.value),
+            )
+            return selection, aggregations, reductions, need_preshuffle
+        else:
+            raise NotImplementedError(
+                "group_by does not support multiple partitions "
+                f"for this aggregation type:\n{type(expr)}\n"
+                f"Only {_GB_AGG_SUPPORTED} are supported."
+            )
+    else:  # pragma: no cover
+        # Unsupported expression
+        raise NotImplementedError(
+            f"GroupBy does not support multiple partitions for this expression:\n{expr}"
+        )
+
+
+@lower_ir_node.register(GroupBy)
+def _(
+    ir: GroupBy, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Pull slice operations out of the GroupBy before lowering
+    if ir.zlice is not None:
+        offset, length = ir.zlice
+        if length is None:  # pragma: no cover
+            return _lower_ir_fallback(
+                ir,
+                rec,
+                msg="This slice not supported for multiple partitions.",
+            )
+        new_join = GroupBy(
+            ir.schema,
+            ir.keys,
+            ir.agg_requests,
+            ir.maintain_order,
+            None,
+            *ir.children,
+        )
+        return rec(Slice(ir.schema, offset, length, new_join))
+
+    # Extract child partitioning
+    original_child = ir.children[0]
+    child, partition_info = rec(ir.children[0])
+
+    # Handle single-partition case
+    if partition_info[child].count == 1:
+        single_part_node = ir.reconstruct([child])
+        partition_info[single_part_node] = partition_info[child]
+        return single_part_node, partition_info
+
+    # Check group-by keys
+    if not all(
+        expr.is_pointwise for expr in traversal([e.value for e in ir.keys])
+    ):  # pragma: no cover
+        return _lower_ir_fallback(
+            ir,
+            rec,
+            msg="group_by does not support multiple partitions for non-pointwise keys.",
+        )
+
+    # Check if we are dealing with any high-cardinality columns
+    post_aggregation_count = 1  # Default tree reduction
+    groupby_key_columns = [ne.name for ne in ir.keys]
+    shuffled = partition_info[child].partitioned_on == ir.keys
+
+    config_options = rec.state["config_options"]
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'lower_ir_node'"
+    )
+
+    child_count = partition_info[child].count
+    if unique_fraction_dict := _get_unique_fractions(
+        groupby_key_columns,
+        config_options.executor.unique_fraction,
+        row_count=rec.state["stats"].row_count.get(original_child),
+        column_stats=rec.state["stats"].column_stats.get(original_child),
+    ):
+        # Use unique_fraction to determine output partitioning
+        unique_fraction = max(unique_fraction_dict.values())
+        post_aggregation_count = max(int(unique_fraction * child_count), 1)
+
+    new_node: IR
+    name_generator = unique_names(ir.schema.keys())
+    # Decompose the aggregation requests into three distinct phases
+    try:
+        selection_exprs, piecewise_exprs, reduction_exprs, need_preshuffle = combine(
+            *(
+                decompose(agg.name, agg.value, names=name_generator)
+                for agg in ir.agg_requests
+            )
+        )
+    except NotImplementedError:
+        if shuffled:  # pragma: no cover
+            # Don't fallback if we are already shuffled.
+            # We can just preserve the child's partitioning
+            new_node = ir.reconstruct([child])
+            partition_info[new_node] = partition_info[child]
+            return new_node, partition_info
+        return _lower_ir_fallback(
+            ir, rec, msg="Failed to decompose groupby aggs for multiple partitions."
+        )
+
+    # Preshuffle ir.child if needed
+    if need_preshuffle:
+        child = Shuffle(
+            child.schema,
+            ir.keys,
+            config_options.executor.shuffle_method,
+            child,
+        )
+        partition_info[child] = PartitionInfo(
+            count=child_count,
+            partitioned_on=ir.keys,
+        )
+        shuffled = True
+
+    # Partition-wise groupby operation
+    pwise_schema = {k.name: k.value.dtype for k in ir.keys} | {
+        k.name: k.value.dtype for k in piecewise_exprs
+    }
+    gb_pwise = GroupBy(
+        pwise_schema,
+        ir.keys,
+        piecewise_exprs,
+        ir.maintain_order,
+        None,
+        child,
+    )
+    child_count = partition_info[child].count
+    partition_info[gb_pwise] = PartitionInfo(count=child_count)
+    grouped_keys = tuple(NamedExpr(k.name, Col(k.value.dtype, k.name)) for k in ir.keys)
+
+    # Reduction
+    gb_inter: GroupBy | Repartition | Shuffle
+    reduction_schema = {k.name: k.value.dtype for k in grouped_keys} | {
+        k.name: k.value.dtype for k in reduction_exprs
+    }
+    if not shuffled and post_aggregation_count > 1:
+        # Shuffle reduction
+        if ir.maintain_order:  # pragma: no cover
+            return _lower_ir_fallback(
+                ir,
+                rec,
+                msg="maintain_order not supported for multiple output partitions.",
+            )
+
+        gb_inter = Shuffle(
+            gb_pwise.schema,
+            grouped_keys,
+            config_options.executor.shuffle_method,
+            gb_pwise,
+        )
+        partition_info[gb_inter] = PartitionInfo(count=post_aggregation_count)
+    else:
+        # N-ary tree reduction
+        assert config_options.executor.name == "streaming", (
+            "'in-memory' executor not supported in 'generate_ir_tasks'"
+        )
+
+        n_ary = config_options.executor.groupby_n_ary
+        count = child_count
+        gb_inter = gb_pwise
+        while count > post_aggregation_count:
+            gb_inter = Repartition(gb_inter.schema, gb_inter)
+            count = max(math.ceil(count / n_ary), post_aggregation_count)
+            partition_info[gb_inter] = PartitionInfo(count=count)
+            if count > post_aggregation_count:
+                gb_inter = GroupBy(
+                    reduction_schema,
+                    grouped_keys,
+                    reduction_exprs,
+                    ir.maintain_order,
+                    None,
+                    gb_inter,
+                )
+                partition_info[gb_inter] = PartitionInfo(count=count)
+
+    # Final aggregation
+    gb_reduce = GroupBy(
+        reduction_schema,
+        grouped_keys,
+        reduction_exprs,
+        ir.maintain_order,
+        ir.zlice,
+        gb_inter,
+    )
+    partition_info[gb_reduce] = PartitionInfo(count=post_aggregation_count)
+
+    # Final Select phase
+    new_node = Select(
+        ir.schema,
+        [
+            *(NamedExpr(k.name, Col(k.value.dtype, k.name)) for k in grouped_keys),
+            *selection_exprs,
+        ],
+        False,  # noqa: FBT003
+        gb_reduce,
+    )
+    partition_info[new_node] = PartitionInfo(
+        count=post_aggregation_count,
+        partitioned_on=grouped_keys,
+    )
+    return new_node, partition_info
```