cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/rapidsmpf/join.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Join logic for the RapidsMPF streaming runtime."""
 
@@ -7,11 +7,13 @@ from __future__ import annotations
 import asyncio
 from typing import TYPE_CHECKING, Any, Literal
 
+from rapidsmpf.memory.buffer import MemoryType
 from rapidsmpf.streaming.core.message import Message
 from rapidsmpf.streaming.cudf.table_chunk import TableChunk
 
 from cudf_polars.containers import DataFrame
 from cudf_polars.dsl.ir import IR, Join
+from cudf_polars.experimental.rapidsmpf.collectives.allgather import AllGatherManager
 from cudf_polars.experimental.rapidsmpf.dispatch import (
     generate_ir_sub_network,
 )
@@ -22,6 +24,10 @@ from cudf_polars.experimental.rapidsmpf.nodes import (
 )
 from cudf_polars.experimental.rapidsmpf.utils import (
     ChannelManager,
+    Metadata,
+    chunk_to_frame,
+    empty_table_chunk,
+    opaque_reservation,
     process_children,
 )
 from cudf_polars.experimental.utils import _concat
@@ -34,48 +40,6 @@ if TYPE_CHECKING:
     from cudf_polars.experimental.rapidsmpf.utils import ChannelPair
 
 
-async def get_small_table(
-    context: Context,
-    small_child: IR,
-    ch_small: ChannelPair,
-) -> list[DataFrame]:
-    """
-    Get the small-table DataFrame partitions from the small-table ChannelPair.
-
-    Parameters
-    ----------
-    context
-        The rapidsmpf context.
-    small_child
-        The small-table child IR node.
-    ch_small
-        The small-table ChannelPair.
-
-    Returns
-    -------
-    list[DataFrame]
-        The small-table DataFrame partitions.
-    """
-    small_chunks = []
-    while (msg := await ch_small.data.recv(context)) is not None:
-        small_chunks.append(
-            TableChunk.from_message(msg).make_available_and_spill(
-                context.br(), allow_overbooking=True
-            )
-        )
-    assert small_chunks, "Empty small side"
-
-    return [
-        DataFrame.from_table(
-            small_chunk.table_view(),
-            list(small_child.schema.keys()),
-            list(small_child.schema.values()),
-            small_chunk.stream,
-        )
-        for small_chunk in small_chunks
-    ]
-
-
 @define_py_node()
 async def broadcast_join_node(
     context: Context,
@@ -85,6 +49,8 @@ async def broadcast_join_node(
     ch_left: ChannelPair,
     ch_right: ChannelPair,
     broadcast_side: Literal["left", "right"],
+    collective_id: int,
+    target_partition_size: int,
 ) -> None:
     """
     Join node for rapidsmpf.
@@ -105,33 +71,132 @@ async def broadcast_join_node(
         The right input ChannelPair.
     broadcast_side
         The side to broadcast.
+    collective_id
+        Pre-allocated collective ID for this operation.
+    target_partition_size
+        The target partition size in bytes.
     """
-    async with shutdown_on_error(context, ch_left.data, ch_right.data, ch_out.data):
+    async with shutdown_on_error(
+        context,
+        ch_left.metadata,
+        ch_left.data,
+        ch_right.metadata,
+        ch_right.data,
+        ch_out.metadata,
+        ch_out.data,
+    ):
+        # Receive metadata.
+        left_metadata, right_metadata = await asyncio.gather(
+            ch_left.recv_metadata(context),
+            ch_right.recv_metadata(context),
+        )
+
+        partitioned_on: tuple[str, ...] = ()
         if broadcast_side == "right":
             # Broadcast right, stream left
             small_ch = ch_right
             large_ch = ch_left
             small_child = ir.children[1]
             large_child = ir.children[0]
+            chunk_count = left_metadata.count
+            partitioned_on = left_metadata.partitioned_on
+            small_duplicated = right_metadata.duplicated
         else:
             # Broadcast left, stream right
             small_ch = ch_left
             large_ch = ch_right
             small_child = ir.children[0]
             large_child = ir.children[1]
+            chunk_count = right_metadata.count
+            small_duplicated = left_metadata.duplicated
+            if ir.options[0] == "Right":
+                partitioned_on = right_metadata.partitioned_on
 
-        # Collect small-side chunks
-        small_dfs = await get_small_table(context, small_child, small_ch)
-        if ir.options[0] != "Inner":
-            # TODO: Use local repartitioning for non-inner joins
-            small_dfs = [_concat(*small_dfs, context=ir_context)]
+        # Send metadata.
+        output_metadata = Metadata(
+            chunk_count,
+            partitioned_on=partitioned_on,
+            duplicated=left_metadata.duplicated and right_metadata.duplicated,
+        )
+        await ch_out.send_metadata(context, output_metadata)
 
-        # Stream through large side, joining with the small-side
-        while (msg := await large_ch.data.recv(context)) is not None:
-            large_chunk = TableChunk.from_message(msg).make_available_and_spill(
-                context.br(), allow_overbooking=True
+        # Collect small-side (may be empty if no data received)
+        small_chunks: list[TableChunk] = []
+        small_size = 0
+        while (msg := await small_ch.data.recv(context)) is not None:
+            small_chunks.append(
+                TableChunk.from_message(msg).make_available_and_spill(
+                    context.br(), allow_overbooking=True
+                )
             )
-            seq_num = msg.sequence_number
+            del msg
+            small_size += small_chunks[-1].data_alloc_size(MemoryType.DEVICE)
+
+        # Allgather is a collective - all ranks must participate even with no local data
+        need_allgather = context.comm().nranks > 1 and not small_duplicated
+        if need_allgather:
+            allgather = AllGatherManager(context, collective_id)
+            for s_id in range(len(small_chunks)):
+                allgather.insert(s_id, small_chunks.pop(0))
+            allgather.insert_finished()
+            stream = ir_context.get_cuda_stream()
+            # extract_concatenated returns a plc.Table, not a TableChunk
+            small_dfs = [
+                DataFrame.from_table(
+                    await allgather.extract_concatenated(stream),
+                    list(small_child.schema.keys()),
+                    list(small_child.schema.values()),
+                    stream,
+                )
+            ]
+        elif len(small_chunks) > 1 and (
+            ir.options[0] != "Inner" or small_size < target_partition_size
+        ):
+            # Pre-concat for non-inner joins, otherwise
+            # we need a local shuffle, and face additional
+            # memory pressure anyway.
+            small_dfs = [
+                _concat(
+                    *[chunk_to_frame(chunk, small_child) for chunk in small_chunks],
+                    context=ir_context,
+                )
+            ]
+            small_chunks.clear()  # small_dfs is not a view of small_chunks anymore
+        else:
+            small_dfs = [
+                chunk_to_frame(small_chunk, small_child) for small_chunk in small_chunks
+            ]
+
+        # Stream through large side, joining with the small-side
+        seq_num = 0
+        large_chunk_processed = False
+        receiving_large_chunks = True
+        while receiving_large_chunks:
+            msg = await large_ch.data.recv(context)
+            if msg is None:
+                receiving_large_chunks = False
+                if large_chunk_processed:
+                    # Normal exit - We've processed all large-table data
+                    break
+                elif small_dfs:
+                    # We received small-table data, but no large-table data.
+                    # This may never happen, but we can handle it by generating
+                    # an empty large-table chunk
+                    stream = ir_context.get_cuda_stream()
+                    large_chunk = empty_table_chunk(large_child, context, stream)
+                else:
+                    # We received no data for either the small or large table.
+                    # Drain the output channel and return
+                    await ch_out.data.drain(context)
+                    return
+            else:
+                large_chunk_processed = True
+                large_chunk = TableChunk.from_message(msg).make_available_and_spill(
+                    context.br(), allow_overbooking=True
+                )
+                seq_num = msg.sequence_number
+                del msg
+
             large_df = DataFrame.from_table(
                 large_chunk.table_view(),
                 list(large_child.schema.keys()),
@@ -139,10 +204,17 @@ async def broadcast_join_node(
                 large_chunk.stream,
             )
 
-            # Perform the join
-            df = _concat(
-                *[
-                    (
+            # Lazily create empty small table if small_dfs is empty
+            if not small_dfs:
+                stream = ir_context.get_cuda_stream()
+                empty_small_chunk = empty_table_chunk(small_child, context, stream)
+                small_dfs = [chunk_to_frame(empty_small_chunk, small_child)]
+
+            large_chunk_size = large_chunk.data_alloc_size(MemoryType.DEVICE)
+            input_bytes = large_chunk_size + small_size
+            with opaque_reservation(context, input_bytes):
+                df = _concat(
+                    *[
                         await asyncio.to_thread(
                             ir.do_evaluate,
                             *ir._non_child_args,
@@ -153,28 +225,31 @@ async def broadcast_join_node(
                             ),
                             context=ir_context,
                         )
-                    )
-                    for small_df in small_dfs
-                ],
-                context=ir_context,
-            )
+                        for small_df in small_dfs
+                    ],
+                    context=ir_context,
+                )
 
-            # Send output chunk
-            await ch_out.data.send(
-                context,
-                Message(
-                    seq_num,
-                    TableChunk.from_pylibcudf_table(
-                        df.table, df.stream, exclusive_view=True
+                # Send output chunk
+                await ch_out.data.send(
+                    context,
+                    Message(
+                        seq_num,
+                        TableChunk.from_pylibcudf_table(
+                            df.table, df.stream, exclusive_view=True
+                        ),
                     ),
-                ),
-            )
+                )
+                del df, large_df, large_chunk
 
+        del small_dfs, small_chunks
         await ch_out.data.drain(context)
 
 
 @generate_ir_sub_network.register(Join)
-def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+def _(
+    ir: Join, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     # Join operation.
     left, right = ir.children
     partition_info = rec.state["partition_info"]
@@ -200,7 +275,8 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
 
     if pwise_join:
         # Partition-wise join (use default_node_multi)
-        nodes.append(
+        partitioning_index = 1 if ir.options[0] == "Right" else 0
+        nodes[ir] = [
             default_node_multi(
                 rec.state["context"],
                 ir,
@@ -210,8 +286,9 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
                    channels[left].reserve_output_slot(),
                    channels[right].reserve_output_slot(),
                 ),
+                partitioning_index=partitioning_index,
             )
-        )
+        ]
        return nodes, channels
 
     else:
@@ -223,7 +300,13 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
     else:
        broadcast_side = "left"
 
-    nodes.append(
+    # Get target partition size
+    config_options = rec.state["config_options"]
+    executor = config_options.executor
+    assert executor.name == "streaming", "Join node requires streaming executor"
+    target_partition_size = executor.target_partition_size
+
+    nodes[ir] = [
        broadcast_join_node(
            rec.state["context"],
            ir,
@@ -232,6 +315,8 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
            channels[left].reserve_output_slot(),
            channels[right].reserve_output_slot(),
            broadcast_side=broadcast_side,
+            collective_id=rec.state["collective_id_map"][ir],
+            target_partition_size=target_partition_size,
        )
-    )
+    ]
    return nodes, channels
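
For orientation, the reworked broadcast_join_node above follows a collect-then-stream pattern: drain the broadcast (small) side first, allgather it across ranks when it is not already duplicated, optionally pre-concatenate it, then stream the large side and join every chunk against the collected small side. The sketch below illustrates only that control flow with plain asyncio queues; join_pair, concat, pre_concat, and the queue objects are hypothetical stand-ins for ir.do_evaluate, _concat, the size/join-type checks, and the rapidsmpf channels, not part of the package API.

import asyncio
from typing import Any, Callable

END = object()  # end-of-stream sentinel, playing the role of a None channel message


async def broadcast_join_sketch(
    small_in: asyncio.Queue,
    large_in: asyncio.Queue,
    out: asyncio.Queue,
    join_pair: Callable[[Any, Any], Any],  # stand-in for ir.do_evaluate on one pair
    concat: Callable[[list[Any]], Any],  # stand-in for _concat
    pre_concat: bool,  # e.g. non-inner join, or small side under the size target
) -> None:
    """Toy version of the streaming broadcast join: collect one side, stream the other."""
    # 1. Drain the broadcast (small) side completely. In the real node this is also
    #    where the allgather collective runs on multi-rank clusters.
    small: list[Any] = []
    while (chunk := await small_in.get()) is not END:
        small.append(chunk)
    if pre_concat and len(small) > 1:
        small = [concat(small)]

    # 2. Stream the large side, joining each chunk against every small partition
    #    and concatenating the partial results into one output chunk.
    while (chunk := await large_in.get()) is not END:
        await out.put(concat([join_pair(chunk, s) for s in small]))

    # 3. Signal end-of-stream downstream (the real node drains its output channel).
    await out.put(END)

The corner cases handled explicitly in the diff (no small-side data, or small-side data with an empty large side) are omitted from the sketch; the real node substitutes empty table chunks via empty_table_chunk rather than skipping the join, and wraps each join in an opaque_reservation sized to the combined input bytes.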