cudf-polars-cu13 25.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,391 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Parallel Join Logic."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import operator
8
+ from functools import reduce
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from cudf_polars.dsl.ir import ConditionalJoin, Join, Slice
12
+ from cudf_polars.experimental.base import PartitionInfo, get_key_name
13
+ from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node
14
+ from cudf_polars.experimental.repartition import Repartition
15
+ from cudf_polars.experimental.shuffle import Shuffle, _hash_partition_dataframe
16
+ from cudf_polars.experimental.utils import _concat, _fallback_inform, _lower_ir_fallback
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import MutableMapping
20
+
21
+ from cudf_polars.dsl.expr import NamedExpr
22
+ from cudf_polars.dsl.ir import IR
23
+ from cudf_polars.experimental.parallel import LowerIRTransformer
24
+ from cudf_polars.utils.config import ShuffleMethod
25
+
26
+
27
def _maybe_shuffle_frame(
    frame: IR,
    on: tuple[NamedExpr, ...],
    partition_info: MutableMapping[IR, PartitionInfo],
    shuffle_method: ShuffleMethod,
    output_count: int,
) -> IR:
    """
    Return ``frame`` hash-partitioned on ``on``, reusing its current layout
    when possible.

    If ``frame`` is already partitioned on ``on`` with ``output_count``
    partitions it is returned unchanged; otherwise a new ``Shuffle`` node is
    inserted and registered in ``partition_info``.
    """
    info = partition_info[frame]
    if info.partitioned_on == on and info.count == output_count:
        # Already shuffled the way we need it — avoid a redundant shuffle.
        return frame

    shuffled = Shuffle(
        frame.schema,
        on,
        shuffle_method,
        frame,
    )
    partition_info[shuffled] = PartitionInfo(
        count=output_count,
        partitioned_on=on,
    )
    return shuffled
54
+
55
+
56
def _make_hash_join(
    ir: Join,
    output_count: int,
    partition_info: MutableMapping[IR, PartitionInfo],
    left: IR,
    right: IR,
    shuffle_method: ShuffleMethod,
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """
    Lower ``ir`` into a shuffle-based (hash) join.

    Both children are shuffled onto their join keys (when not already
    partitioned appropriately), and the resulting join node is recorded in
    ``partition_info`` along with the keys its output is partitioned on.
    """
    # Ensure each side is hash-partitioned on its join keys.
    shuffled_left = _maybe_shuffle_frame(
        left,
        ir.left_on,
        partition_info,
        shuffle_method,
        output_count,
    )
    shuffled_right = _maybe_shuffle_frame(
        right,
        ir.right_on,
        partition_info,
        shuffle_method,
        output_count,
    )
    if (shuffled_left, shuffled_right) != (left, right):
        # At least one child changed; rebuild the join over the new children.
        ir = ir.reconstruct([shuffled_left, shuffled_right])

    # Work out what the join output is partitioned on. For "Left"-like joins
    # the output follows the left keys; for "Right" joins, the right keys.
    how = ir.options[0]
    if ir.left_on == ir.right_on or how in ("Left", "Semi", "Anti"):
        partitioned_on: tuple[NamedExpr, ...] = ir.left_on
    elif how == "Right":
        partitioned_on = ir.right_on
    else:
        partitioned_on = ()
    partition_info[ir] = PartitionInfo(
        count=output_count,
        partitioned_on=partitioned_on,
    )

    return ir, partition_info
96
+
97
+
98
def _should_bcast_join(
    ir: Join,
    left: IR,
    right: IR,
    partition_info: MutableMapping[IR, PartitionInfo],
    output_count: int,
    broadcast_join_limit: int,
) -> bool:
    """
    Decide whether ``ir`` should be lowered as a broadcast join.

    A broadcast join is chosen when the large side is not already shuffled
    onto its join keys, the small side has at most ``broadcast_join_limit``
    partitions, and the join kind commutes with broadcasting the small side.
    """
    # Identify the large side and its join keys; ties go to the left.
    if partition_info[left].count >= partition_info[right].count:
        small_count = partition_info[right].count
        large, large_on = left, ir.left_on
    else:
        small_count = partition_info[left].count
        large, large_on = right, ir.right_on

    # If the large table is already shuffled onto its keys with the target
    # partition count, a hash join is effectively free — don't broadcast.
    large_info = partition_info[large]
    if large_info.partitioned_on == large_on and large_info.count == output_count:
        return False

    # TODO: Make this value/heuristic configurable).
    # We may want to account for the number of workers.
    if small_count > broadcast_join_limit:
        return False

    # The join "kind" must be compatible with broadcasting the small side.
    how = ir.options[0]
    return (
        how == "Inner"
        or (how in ("Left", "Semi", "Anti") and large == left)
        or (how == "Right" and large == right)
    )
138
+
139
+
140
def _make_bcast_join(
    ir: Join,
    output_count: int,
    partition_info: MutableMapping[IR, PartitionInfo],
    left: IR,
    right: IR,
    shuffle_method: ShuffleMethod,
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """
    Lower ``ir`` into a broadcast join.

    For non-"Inner" joins the smaller table is first shuffled (keeping its
    own partition count) because a non-"inner" join does NOT commute with
    concatenation: we cannot simply join a large-table partition to each
    small-table partition and concatenate the partial results. In some
    cases the partial joins could be performed sequentially instead, but
    this is a catch-all algorithm that works for all cases.
    """
    how = ir.options[0]
    if how != "Inner":
        left_count = partition_info[left].count
        right_count = partition_info[right].count
        # Shuffle whichever side is smaller (if it isn't already shuffled).
        if left_count >= right_count:
            right = _maybe_shuffle_frame(
                right, ir.right_on, partition_info, shuffle_method, right_count
            )
        else:
            left = _maybe_shuffle_frame(
                left, ir.left_on, partition_info, shuffle_method, left_count
            )

    new_node = ir.reconstruct([left, right])
    partition_info[new_node] = PartitionInfo(count=output_count)
    return new_node, partition_info
185
+
186
+
187
@lower_ir_node.register(ConditionalJoin)
def _(
    ir: ConditionalJoin, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """
    Lower a ConditionalJoin for multi-partition execution.

    A cross-partition conditional join is not implemented, so the smaller
    side is collapsed to a single partition (with a fallback notification)
    and the join output inherits the larger side's partition count.
    """
    if ir.options[2]:  # pragma: no cover
        return _lower_ir_fallback(
            ir,
            rec,
            msg="Slice not supported in ConditionalJoin for multiple partitions.",
        )

    # Lower children
    left, right = ir.children
    left, pi_left = rec(left)
    right, pi_right = rec(right)

    # Fallback to single partition on the smaller table
    left_count = pi_left[left].count
    right_count = pi_right[right].count
    output_count = max(left_count, right_count)
    fallback_msg = "ConditionalJoin not supported for multiple partitions."
    if left_count < right_count:
        if left_count > 1:
            left = Repartition(left.schema, left)
            pi_left[left] = PartitionInfo(count=1)
            _fallback_inform(fallback_msg, rec.state["config_options"])
    elif right_count > 1:
        # Bug fix: repartition `right` with its own schema (previously this
        # incorrectly used `left.schema`, mislabeling the right table's
        # columns after the repartition).
        right = Repartition(right.schema, right)
        pi_right[right] = PartitionInfo(count=1)
        _fallback_inform(fallback_msg, rec.state["config_options"])

    # Reconstruct and return
    new_node = ir.reconstruct([left, right])
    partition_info = reduce(operator.or_, (pi_left, pi_right))
    partition_info[new_node] = PartitionInfo(count=output_count)
    return new_node, partition_info
223
+
224
+
225
@lower_ir_node.register(Join)
def _(
    ir: Join, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """
    Lower an equi-Join for multi-partition execution.

    Slice options are hoisted into an explicit ``Slice`` node first. After
    lowering the children, the join is executed partition-wise (single
    partition), via broadcast join (small side below the configured limit),
    or via hash join (both sides shuffled on their keys).
    """
    # Pull slice operations out of the Join before lowering
    if (zlice := ir.options[2]) is not None:
        offset, length = zlice
        if length is None:  # pragma: no cover
            return _lower_ir_fallback(
                ir,
                rec,
                msg="This slice not supported for multiple partitions.",
            )
        # Rebuild the join without the slice, then slice the result.
        new_join = Join(
            ir.schema,
            ir.left_on,
            ir.right_on,
            (*ir.options[:2], None, *ir.options[3:]),
            *ir.children,
        )
        return rec(Slice(ir.schema, offset, length, new_join))

    # Lower children
    children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True)
    partition_info = reduce(operator.or_, _partition_info)

    left, right = children
    output_count = max(partition_info[left].count, partition_info[right].count)
    if output_count == 1:
        # Single partition: a plain partition-wise join suffices.
        new_node = ir.reconstruct(children)
        partition_info[new_node] = PartitionInfo(count=1)
        return new_node, partition_info
    elif ir.options[0] == "Cross":  # pragma: no cover
        # Fix: corrected grammar in the user-facing fallback message
        # ("not support" -> "not supported").
        return _lower_ir_fallback(
            ir, rec, msg="Cross join not supported for multiple partitions."
        )

    config_options = rec.state["config_options"]
    assert config_options.executor.name == "streaming", (
        "'in-memory' executor not supported in 'lower_join'"
    )
    if _should_bcast_join(
        ir,
        left,
        right,
        partition_info,
        output_count,
        config_options.executor.broadcast_join_limit,
    ):
        # Create a broadcast join
        return _make_bcast_join(
            ir,
            output_count,
            partition_info,
            left,
            right,
            config_options.executor.shuffle_method,
        )
    else:
        # Create a hash join
        return _make_hash_join(
            ir,
            output_count,
            partition_info,
            left,
            right,
            config_options.executor.shuffle_method,
        )
293
+
294
+
295
@generate_ir_tasks.register(Join)
def _(
    ir: Join, partition_info: MutableMapping[IR, PartitionInfo]
) -> MutableMapping[Any, Any]:
    """
    Build the task graph for a lowered Join node.

    When both children are aligned with the output partitioning (or there is
    only one partition), the join is purely partition-wise. Otherwise a
    broadcast join graph is built: every small-side partition is joined
    against each large-side partition (hash-splitting the large side for
    non-inner joins) and the partial results are concatenated.
    """
    left, right = ir.children
    output_count = partition_info[ir].count

    def _aligned(child: IR, on: tuple[NamedExpr, ...]) -> bool:
        # True when `child` is already partitioned on the join keys with the
        # same partition count as the output.
        info = partition_info[child]
        return info.partitioned_on == on and info.count == output_count

    if output_count == 1 or (
        _aligned(left, ir.left_on) and _aligned(right, ir.right_on)
    ):
        # Partition-wise join: zip the i-th partitions of each child.
        left_name = get_key_name(left)
        right_name = get_key_name(right)
        return {
            key: (
                ir.do_evaluate,
                *ir._non_child_args,
                (left_name, i),
                (right_name, i),
            )
            for i, key in enumerate(partition_info[ir].keys(ir))
        }

    # Broadcast join: identify the small side (ties keep left as large).
    if partition_info[left].count >= partition_info[right].count:
        small_side = "Right"
        small_name = get_key_name(right)
        small_size = partition_info[right].count
        large_name = get_key_name(left)
        large_on = ir.left_on
    else:
        small_side = "Left"
        small_name = get_key_name(left)
        small_size = partition_info[left].count
        large_name = get_key_name(right)
        large_on = ir.right_on

    out_name = get_key_name(ir)
    out_size = partition_info[ir].count
    split_name = f"split-{out_name}"
    getit_name = f"getit-{out_name}"
    inter_name = f"inter-{out_name}"

    # Split each large partition if we have multiple small partitions
    # (unless this is an inner join, which commutes with concatenation).
    split_large = ir.options[0] != "Inner" and small_size > 1

    graph: MutableMapping[Any, Any] = {}
    for part_out in range(out_size):
        if split_large:
            graph[(split_name, part_out)] = (
                _hash_partition_dataframe,
                (large_name, part_out),
                part_out,
                small_size,
                None,
                large_on,
            )

        pieces = []
        for j in range(small_size):
            large_key: tuple[str, int] | tuple[str, int, int]
            if split_large:
                # Pick out the j-th hash bucket of this large partition.
                large_key = (getit_name, part_out, j)
                graph[large_key] = (operator.getitem, (split_name, part_out), j)
            else:
                large_key = (large_name, part_out)

            # Order the join operands so left/right match the IR definition.
            join_children = [large_key, (small_name, j)]
            if small_side == "Left":
                join_children.reverse()

            inter_key = (inter_name, part_out, j)
            graph[inter_key] = (
                ir.do_evaluate,
                ir.left_on,
                ir.right_on,
                ir.options,
                *join_children,
            )
            pieces.append(inter_key)

        if len(pieces) == 1:
            # Only one partial result: alias it as the output task directly.
            graph[(out_name, part_out)] = graph.pop(pieces[0])
        else:
            graph[(out_name, part_out)] = (_concat, *pieces)

    return graph