cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/rapidsmpf/io.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """IO logic for the RapidsMPF streaming runtime."""
 
@@ -34,9 +34,14 @@ from cudf_polars.experimental.rapidsmpf.dispatch import (
 )
 from cudf_polars.experimental.rapidsmpf.nodes import (
     define_py_node,
+    metadata_feeder_node,
     shutdown_on_error,
 )
-from cudf_polars.experimental.rapidsmpf.utils import ChannelManager
+from cudf_polars.experimental.rapidsmpf.utils import (
+    ChannelManager,
+    Metadata,
+    opaque_reservation,
+)
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -103,7 +108,7 @@ class Lineariser:
 
         # Forward any remaining buffered messages
         for seq in sorted(buffer.keys()):
-            await self.ch_out.send(self.context, buffer[seq])
+            await self.ch_out.send(self.context, buffer.pop(seq))
 
         await self.ch_out.drain(self.context)
 
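The only functional change in this Lineariser hunk is `buffer[seq]` becoming `buffer.pop(seq)`, so each buffered message is released as soon as it has been forwarded instead of being kept alive until the whole flush finishes. A minimal standalone sketch of that ordered-flush pattern (the plain dict and list stand in for the real rapidsmpf buffer and output channel, which are not shown in this diff):

def flush_in_order(buffer: dict[int, object], out: list[object]) -> None:
    # sorted() snapshots the keys, so popping inside the loop is safe.
    for seq in sorted(buffer.keys()):
        # pop() drops the buffered reference the moment the message is forwarded.
        out.append(buffer.pop(seq))


pending = {2: "chunk-2", 0: "chunk-0", 1: "chunk-1"}
sent: list[object] = []
flush_in_order(pending, sent)
assert sent == ["chunk-0", "chunk-1", "chunk-2"]
assert not pending  # the buffer empties as it drains
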
@@ -138,6 +143,7 @@ async def dataframescan_node(
     *,
     num_producers: int,
     rows_per_partition: int,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     DataFrameScan node for rapidsmpf.
@@ -156,19 +162,26 @@ async def dataframescan_node(
         The number of producers to use for the DataFrameScan node.
     rows_per_partition
         The number of rows per partition.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    nrows = max(ir.df.shape()[0], 1)
-    global_count = math.ceil(nrows / rows_per_partition)
+    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
+        # Find local partition count.
+        nrows = ir.df.shape()[0]
+        global_count = math.ceil(nrows / rows_per_partition) if nrows > 0 else 0
+
+        # For single rank, simplify the logic
+        if context.comm().nranks == 1:
+            local_count = global_count
+            local_offset = 0
+        else:
+            local_count = math.ceil(global_count / context.comm().nranks)
+            local_offset = local_count * context.comm().rank
 
-    # For single rank, simplify the logic
-    if context.comm().nranks == 1:
-        local_count = global_count
-        local_offset = 0
-    else:
-        local_count = math.ceil(global_count / context.comm().nranks)
-        local_offset = local_count * context.comm().rank
+        # Send basic metadata
+        await ch_out.send_metadata(context, Metadata(max(1, local_count)))
 
-    async with shutdown_on_error(context, ch_out.data):
         # Build list of IR slices to read
         ir_slices = []
         for seq_num in range(local_count):
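For reference, the rank-local partition arithmetic introduced above can be exercised on its own; the helper below is a sketch written for this diff (it is not part of the package) and the numbers are purely illustrative:

import math


def local_partitions(
    nrows: int, rows_per_partition: int, rank: int, nranks: int
) -> tuple[int, int]:
    # Mirrors the local_count / local_offset logic in dataframescan_node above.
    global_count = math.ceil(nrows / rows_per_partition) if nrows > 0 else 0
    if nranks == 1:
        return global_count, 0
    local_count = math.ceil(global_count / nranks)
    return local_count, local_count * rank


# 950 rows at <= 100 rows per partition -> 10 global partitions.
assert local_partitions(950, 100, rank=0, nranks=4) == (3, 0)
assert local_partitions(950, 100, rank=3, nranks=4) == (3, 9)
# An empty frame now yields zero partitions instead of one.
assert local_partitions(0, 100, rank=0, nranks=1) == (0, 0)
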
@@ -183,6 +196,26 @@ async def dataframescan_node(
                 )
             )
 
+        # If there are no slices, drain the channel and return
+        if len(ir_slices) == 0:
+            await ch_out.data.drain(context)
+            return
+
+        # If there is only one ir_slices or one producer, we can
+        # skip the lineariser and read the chunks directly
+        if len(ir_slices) == 1 or num_producers == 1:
+            for seq_num, ir_slice in enumerate(ir_slices):
+                await read_chunk(
+                    context,
+                    ir_slice,
+                    seq_num,
+                    ch_out.data,
+                    ir_context,
+                    estimated_chunk_bytes,
+                )
+            await ch_out.data.drain(context)
+            return
+
         # Use Lineariser to ensure ordered delivery
         num_producers = min(num_producers, len(ir_slices))
         lineariser = Lineariser(context, ch_out.data, num_producers)
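The fast path added above bypasses the Lineariser whenever chunk order is already guaranteed. The predicate below merely restates that decision as standalone code (a sketch for this diff, not a function from the package):

def needs_lineariser(num_items: int, num_producers: int) -> bool:
    # Reordering is only needed when several producers read several chunks
    # concurrently and may therefore finish out of sequence.
    return num_items > 1 and num_producers > 1


assert not needs_lineariser(1, 8)   # a single slice is trivially ordered
assert not needs_lineariser(16, 1)  # a single producer reads sequentially
assert needs_lineariser(16, 8)      # concurrent producers can complete out of order
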
@@ -203,6 +236,7 @@ async def dataframescan_node(
                 task_idx,
                 ch_out,
                 ir_context,
+                estimated_chunk_bytes,
             )
             await ch_out.drain(context)
 
@@ -216,27 +250,32 @@
 @generate_ir_sub_network.register(DataFrameScan)
 def _(
     ir: DataFrameScan, rec: SubNetGenerator
-) -> tuple[list[Any], dict[IR, ChannelManager]]:
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     config_options = rec.state["config_options"]
     assert config_options.executor.name == "streaming", (
         "'in-memory' executor not supported in 'generate_ir_sub_network'"
     )
     rows_per_partition = config_options.executor.max_rows_per_partition
     num_producers = rec.state["max_io_threads"]
+    # Use target_partition_size as the estimated chunk size
+    estimated_chunk_bytes = config_options.executor.target_partition_size
 
     context = rec.state["context"]
     ir_context = rec.state["ir_context"]
     channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
-    nodes: list[Any] = [
-        dataframescan_node(
-            context,
-            ir,
-            ir_context,
-            channels[ir].reserve_input_slot(),
-            num_producers=num_producers,
-            rows_per_partition=rows_per_partition,
-        )
-    ]
+    nodes: dict[IR, list[Any]] = {
+        ir: [
+            dataframescan_node(
+                context,
+                ir,
+                ir_context,
+                channels[ir].reserve_input_slot(),
+                num_producers=num_producers,
+                rows_per_partition=rows_per_partition,
+                estimated_chunk_bytes=estimated_chunk_bytes,
+            )
+        ]
+    }
 
     return nodes, channels
 
@@ -278,6 +317,7 @@ async def read_chunk(
     seq_num: int,
     ch_out: Channel[TableChunk],
     ir_context: IRExecutionContext,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     Read a chunk from disk and send it to the output channel.
@@ -294,24 +334,27 @@ async def read_chunk(
         The output channel.
     ir_context
         The execution context for the IR node.
+    estimated_chunk_bytes
+        Estimated size of the chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    # Evaluate and send the Scan-node result
-    df = await asyncio.to_thread(
-        scan.do_evaluate,
-        *scan._non_child_args,
-        context=ir_context,
-    )
-    await ch_out.send(
-        context,
-        Message(
-            seq_num,
-            TableChunk.from_pylibcudf_table(
-                df.table,
-                df.stream,
-                exclusive_view=True,
+    with opaque_reservation(context, estimated_chunk_bytes):
+        df = await asyncio.to_thread(
+            scan.do_evaluate,
+            *scan._non_child_args,
+            context=ir_context,
+        )
+        await ch_out.send(
+            context,
+            Message(
+                seq_num,
+                TableChunk.from_pylibcudf_table(
+                    df.table,
+                    df.stream,
+                    exclusive_view=True,
+                ),
             ),
-        ),
-    )
+        )
 
 
 @define_py_node()
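The `opaque_reservation(context, estimated_chunk_bytes)` context manager wrapped around the read above reserves an estimated amount of memory before the chunk is materialised, which is what the new `estimated_chunk_bytes` parameter feeds. Its real implementation lives in cudf_polars.experimental.rapidsmpf.utils and is not shown in this diff; the toy budget below only sketches the general reserve-then-release pattern under that assumption:

from collections.abc import Iterator
from contextlib import contextmanager


class MemoryBudget:
    """Toy tracker: reserve an estimate before a read, release it afterwards."""

    def __init__(self, limit: int) -> None:
        self.limit = limit
        self.reserved = 0

    @contextmanager
    def reserve(self, nbytes: int) -> Iterator[None]:
        if self.reserved + nbytes > self.limit:
            raise MemoryError("estimate exceeds budget; spill or wait first")
        self.reserved += nbytes
        try:
            yield
        finally:
            self.reserved -= nbytes


budget = MemoryBudget(limit=2 * 1024**3)
with budget.reserve(256 * 1024**2):  # estimated_chunk_bytes for one chunk
    chunk = b"..."  # stand-in for the decoded table chunk
assert budget.reserved == 0
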
@@ -324,6 +367,7 @@ async def scan_node(
     num_producers: int,
     plan: IOPartitionPlan,
     parquet_options: ParquetOptions,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     Scan node for rapidsmpf.
@@ -344,8 +388,11 @@ async def scan_node(
         The partitioning plan.
     parquet_options
         The Parquet options.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    async with shutdown_on_error(context, ch_out.data):
+    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
         # Build a list of local Scan operations
         scans: list[Scan | SplitScan] = []
         if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
@@ -353,9 +400,11 @@ async def scan_node(
             local_count = math.ceil(count / context.comm().nranks)
             local_offset = local_count * context.comm().rank
             path_offset = local_offset // plan.factor
-            path_count = math.ceil(local_count / plan.factor)
+            path_end = math.ceil((local_offset + local_count) / plan.factor)
+            path_count = path_end - path_offset
             local_paths = ir.paths[path_offset : path_offset + path_count]
             sindex = local_offset % plan.factor
+            splits_created = 0
             for path in local_paths:
                 base_scan = Scan(
                     ir.schema,
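The `path_count` change above matters when a rank's first split does not fall on a file boundary: rounding `local_count / plan.factor` up is not enough once `local_offset` lands partway through a file, so the end index has to be computed before subtracting. A worked example with illustrative numbers (not taken from the package):

import math

factor = 3        # each file is split into 3 pieces
local_offset = 2  # this rank owns splits 2 and 3: the last piece of file 0
local_count = 2   # and the first piece of file 1

path_offset = local_offset // factor                         # 0
old_path_count = math.ceil(local_count / factor)             # 1 -> misses file 1
path_end = math.ceil((local_offset + local_count) / factor)  # 2
new_path_count = path_end - path_offset                      # 2 -> covers both files

assert old_path_count == 1
assert new_path_count == 2
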
@@ -371,7 +420,7 @@ async def scan_node(
                     ir.predicate,
                     parquet_options,
                 )
-                while sindex < plan.factor:
+                while sindex < plan.factor and splits_created < local_count:
                     scans.append(
                         SplitScan(
                             ir.schema,
@@ -382,6 +431,7 @@ async def scan_node(
                         )
                     )
                     sindex += 1
+                    splits_created += 1
                 sindex = 0
 
         else:
@@ -392,22 +442,46 @@ async def scan_node(
             paths_offset_end = paths_offset_start + plan.factor * local_count
             for offset in range(paths_offset_start, paths_offset_end, plan.factor):
                 local_paths = ir.paths[offset : offset + plan.factor]
-                scans.append(
-                    Scan(
-                        ir.schema,
-                        ir.typ,
-                        ir.reader_options,
-                        ir.cloud_options,
-                        local_paths,
-                        ir.with_columns,
-                        ir.skip_rows,
-                        ir.n_rows,
-                        ir.row_index,
-                        ir.include_file_paths,
-                        ir.predicate,
-                        parquet_options,
+                if len(local_paths) > 0:  # Only add scan if there are paths
+                    scans.append(
+                        Scan(
+                            ir.schema,
+                            ir.typ,
+                            ir.reader_options,
+                            ir.cloud_options,
+                            local_paths,
+                            ir.with_columns,
+                            ir.skip_rows,
+                            ir.n_rows,
+                            ir.row_index,
+                            ir.include_file_paths,
+                            ir.predicate,
+                            parquet_options,
+                        )
                     )
+
+        # Send basic metadata
+        await ch_out.send_metadata(context, Metadata(max(1, len(scans))))
+
+        # If there is nothing to scan, drain the channel and return
+        if len(scans) == 0:
+            await ch_out.data.drain(context)
+            return
+
+        # If there is only one scan or one producer, we can
+        # skip the lineariser and read the chunks directly
+        if len(scans) == 1 or num_producers == 1:
+            for seq_num, scan in enumerate(scans):
+                await read_chunk(
+                    context,
+                    scan,
+                    seq_num,
+                    ch_out.data,
+                    ir_context,
+                    estimated_chunk_bytes,
                 )
+            await ch_out.data.drain(context)
+            return
 
         # Use Lineariser to ensure ordered delivery
         num_producers = min(num_producers, len(scans))
@@ -429,6 +503,7 @@ async def scan_node(
                 task_idx,
                 ch_out,
                 ir_context,
+                estimated_chunk_bytes,
             )
             await ch_out.drain(context)
 
@@ -548,9 +623,12 @@ def make_rapidsmpf_read_parquet_node(
 
 
 @generate_ir_sub_network.register(Scan)
-def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+def _(
+    ir: Scan, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     config_options = rec.state["config_options"]
-    assert config_options.executor.name == "streaming", (
+    executor = rec.state["config_options"].executor
+    assert executor.name == "streaming", (
         "'in-memory' executor not supported in 'generate_ir_sub_network'"
     )
     parquet_options = config_options.parquet_options
@@ -558,17 +636,28 @@ def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
     num_producers = rec.state["max_io_threads"]
     channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
 
-    # Use rapidsmpf native read_parquet for multi-partition Parquet scans.
+    assert partition_info.io_plan is not None, "Scan node must have a partition plan"
+    plan: IOPartitionPlan = partition_info.io_plan
+
+    # Native node cannot split large files in distributed mode yet
+    distributed_split_files = (
+        plan.flavor == IOPartitionFlavor.SPLIT_FILES
+        and rec.state["context"].comm().nranks > 1
+    )
+
+    # Use rapidsmpf native read_parquet node if possible
     ch_pair = channels[ir].reserve_input_slot()
-    nodes: list[Any]
+    nodes: dict[IR, list[Any]] = {}
     native_node: Any = None
     if (
-        partition_info.count > 1
+        parquet_options.use_rapidsmpf_native
+        and partition_info.count > 1
         and ir.typ == "parquet"
        and ir.row_index is None
         and ir.include_file_paths is None
         and ir.n_rows == -1
         and ir.skip_rows == 0
+        and not distributed_split_files
     ):
         native_node = make_rapidsmpf_read_parquet_node(
             rec.state["context"],
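Rewriting the eligibility test above as a standalone predicate makes the gating easier to read in one place; the `ScanRequest` dataclass here is a simplification invented for this example, not a type from the package:

from dataclasses import dataclass


@dataclass
class ScanRequest:
    typ: str
    row_index: object | None
    include_file_paths: str | None
    n_rows: int
    skip_rows: int


def can_use_native_parquet(
    req: ScanRequest,
    *,
    use_rapidsmpf_native: bool,
    partition_count: int,
    distributed_split_files: bool,
) -> bool:
    # Mirrors the if-condition guarding make_rapidsmpf_read_parquet_node above.
    return (
        use_rapidsmpf_native
        and partition_count > 1
        and req.typ == "parquet"
        and req.row_index is None
        and req.include_file_paths is None
        and req.n_rows == -1
        and req.skip_rows == 0
        and not distributed_split_files
    )


req = ScanRequest("parquet", None, None, n_rows=-1, skip_rows=0)
assert can_use_native_parquet(
    req,
    use_rapidsmpf_native=True,
    partition_count=8,
    distributed_split_files=False,
)
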
@@ -580,17 +669,19 @@ def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
         )
 
     if native_node is not None:
-        nodes = [native_node]
+        # Need metadata node, because the native read_parquet
+        # node does not send metadata.
+        metadata_node = metadata_feeder_node(
+            rec.state["context"],
+            ch_pair,
+            Metadata(partition_info.count),
+        )
+        nodes[ir] = [metadata_node, native_node]
     else:
         # Fall back to scan_node (predicate not convertible, or other constraint)
-        assert partition_info.io_plan is not None, (
-            "Scan node must have a partition plan"
-        )
-        plan: IOPartitionPlan = partition_info.io_plan
-        if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
-            parquet_options = dataclasses.replace(parquet_options, chunked=False)
+        parquet_options = dataclasses.replace(parquet_options, chunked=False)
 
-        nodes = [
+        nodes[ir] = [
             scan_node(
                 rec.state["context"],
                 ir,
@@ -599,6 +690,7 @@ def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
                 num_producers=num_producers,
                 plan=plan,
                 parquet_options=parquet_options,
+                estimated_chunk_bytes=executor.target_partition_size,
             )
         ]
     return nodes, channels