cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,696 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """IO logic for the RapidsMPF streaming runtime."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import dataclasses
9
+ import math
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ from rapidsmpf.streaming.core.message import Message
13
+ from rapidsmpf.streaming.cudf.table_chunk import TableChunk
14
+
15
+ import pylibcudf as plc
16
+
17
+ from cudf_polars.dsl.ir import (
18
+ IR,
19
+ DataFrameScan,
20
+ Scan,
21
+ _cast_literals_to_physical_types,
22
+ _parquet_physical_types,
23
+ )
24
+ from cudf_polars.dsl.to_ast import to_parquet_filter
25
+ from cudf_polars.experimental.base import (
26
+ IOPartitionFlavor,
27
+ IOPartitionPlan,
28
+ PartitionInfo,
29
+ )
30
+ from cudf_polars.experimental.io import SplitScan, scan_partition_plan
31
+ from cudf_polars.experimental.rapidsmpf.dispatch import (
32
+ generate_ir_sub_network,
33
+ lower_ir_node,
34
+ )
35
+ from cudf_polars.experimental.rapidsmpf.nodes import (
36
+ define_py_node,
37
+ metadata_feeder_node,
38
+ shutdown_on_error,
39
+ )
40
+ from cudf_polars.experimental.rapidsmpf.utils import (
41
+ ChannelManager,
42
+ Metadata,
43
+ opaque_reservation,
44
+ )
45
+
46
+ if TYPE_CHECKING:
47
+ from collections.abc import MutableMapping
48
+
49
+ from rapidsmpf.streaming.core.channel import Channel
50
+ from rapidsmpf.streaming.core.context import Context
51
+
52
+ from cudf_polars.dsl.ir import IR, IRExecutionContext
53
+ from cudf_polars.experimental.base import ColumnStat, StatsCollector
54
+ from cudf_polars.experimental.rapidsmpf.core import SubNetGenerator
55
+ from cudf_polars.experimental.rapidsmpf.dispatch import LowerIRTransformer
56
+ from cudf_polars.experimental.rapidsmpf.utils import ChannelPair
57
+ from cudf_polars.utils.config import ParquetOptions
58
+
59
+
60
class Lineariser:
    """
    Linearizer that ensures ordered delivery from multiple concurrent producers.

    Creates one input channel per producer and streams messages to output
    in sequence-number order, buffering only out-of-order arrivals.
    """

    def __init__(
        self, context: Context, ch_out: Channel[TableChunk], num_producers: int
    ):
        # One dedicated input channel per producer; producers write into
        # their own channel and ``drain`` merges them in order.
        self.context = context
        self.ch_out = ch_out
        self.num_producers = num_producers
        self.input_channels = [context.create_channel() for _ in range(num_producers)]

    async def drain(self) -> None:
        """
        Drain producer channels and forward messages in sequence-number order.

        Streams messages to output as soon as they arrive in order, buffering
        only out-of-order messages to minimize memory pressure.
        """
        # Next sequence number we are allowed to emit on ch_out.
        next_seq = 0
        # Out-of-order arrivals, keyed by sequence number.
        buffer: dict[int, Message] = {}

        # Map each in-flight recv task back to the channel it reads from,
        # so we can re-arm a recv on exactly that channel when it completes.
        pending_tasks = {
            asyncio.create_task(ch.recv(self.context)): ch for ch in self.input_channels
        }

        while pending_tasks:
            done, _ = await asyncio.wait(
                pending_tasks.keys(), return_when=asyncio.FIRST_COMPLETED
            )

            for task in done:
                ch = pending_tasks.pop(task)
                msg = await task

                # A None result means this producer's channel is exhausted;
                # we simply do not schedule another recv for it.
                if msg is not None:
                    buffer[msg.sequence_number] = msg
                    new_task = asyncio.create_task(ch.recv(self.context))
                    pending_tasks[new_task] = ch

                # Forward consecutive messages
                while next_seq in buffer:
                    await self.ch_out.send(self.context, buffer.pop(next_seq))
                    next_seq += 1

        # Forward any remaining buffered messages
        # (only non-empty if the incoming sequence had gaps).
        for seq in sorted(buffer.keys()):
            await self.ch_out.send(self.context, buffer.pop(seq))

        await self.ch_out.drain(self.context)
+
115
+
116
@lower_ir_node.register(DataFrameScan)
def _(
    ir: DataFrameScan, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """Lower a DataFrameScan node and estimate its partition count."""
    options = rec.state["config_options"]
    assert options.executor.name == "streaming", (
        "'in-memory' executor not supported in 'lower_ir_node_rapidsmpf'"
    )

    # NOTE: We calculate the expected partition count
    # to help trigger fallback warnings in lower_ir_graph.
    # The generate_ir_sub_network logic is NOT required
    # to obey this partition count. However, the count
    # WILL match after an IO operation (for now).
    chunk_rows = options.executor.max_rows_per_partition
    # Treat an empty frame as a single row so we always report >= 1 partition.
    total_rows = max(ir.df.shape()[0], 1)
    partition_count = math.ceil(total_rows / chunk_rows)

    return ir, {ir: PartitionInfo(count=partition_count)}
135
+
136
+
137
@define_py_node()
async def dataframescan_node(
    context: Context,
    ir: DataFrameScan,
    ir_context: IRExecutionContext,
    ch_out: ChannelPair,
    *,
    num_producers: int,
    rows_per_partition: int,
    estimated_chunk_bytes: int,
) -> None:
    """
    DataFrameScan node for rapidsmpf.

    Parameters
    ----------
    context
        The rapidsmpf context.
    ir
        The DataFrameScan node.
    ir_context
        The execution context for the IR node.
    ch_out
        The output ChannelPair.
    num_producers
        The number of producers to use for the DataFrameScan node.
    rows_per_partition
        The number of rows per partition.
    estimated_chunk_bytes
        Estimated size of each chunk in bytes. Used for memory reservation
        with block spilling to avoid thrashing.
    """
    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
        # Find local partition count.
        nrows = ir.df.shape()[0]
        global_count = math.ceil(nrows / rows_per_partition) if nrows > 0 else 0

        # For single rank, simplify the logic
        if context.comm().nranks == 1:
            local_count = global_count
            local_offset = 0
        else:
            # local_offset is measured in partitions, not rows; each rank
            # takes a contiguous block of ceil(global/nranks) partitions.
            local_count = math.ceil(global_count / context.comm().nranks)
            local_offset = local_count * context.comm().rank

        # Send basic metadata
        await ch_out.send_metadata(context, Metadata(max(1, local_count)))

        # Build list of IR slices to read
        ir_slices: list[DataFrameScan] = []
        for seq_num in range(local_count):
            # Convert the partition-based offset into a row offset.
            offset = local_offset * rows_per_partition + seq_num * rows_per_partition
            if offset >= nrows:
                # Later ranks may be assigned past the end of the frame.
                break
            ir_slices.append(
                DataFrameScan(
                    ir.schema,
                    ir.df.slice(offset, rows_per_partition),
                    ir.projection,
                )
            )

        # If there are no slices, drain the channel and return
        if len(ir_slices) == 0:
            await ch_out.data.drain(context)
            return

        # If there is only one ir_slices or one producer, we can
        # skip the lineariser and read the chunks directly
        if len(ir_slices) == 1 or num_producers == 1:
            for seq_num, ir_slice in enumerate(ir_slices):
                await read_chunk(
                    context,
                    ir_slice,
                    seq_num,
                    ch_out.data,
                    ir_context,
                    estimated_chunk_bytes,
                )
            await ch_out.data.drain(context)
            return

        # Use Lineariser to ensure ordered delivery
        num_producers = min(num_producers, len(ir_slices))
        lineariser = Lineariser(context, ch_out.data, num_producers)

        # Assign tasks to producers using round-robin
        producer_tasks: list[list[tuple[int, DataFrameScan]]] = [
            [] for _ in range(num_producers)
        ]
        for task_idx, ir_slice in enumerate(ir_slices):
            producer_id = task_idx % num_producers
            producer_tasks[producer_id].append((task_idx, ir_slice))

        # Each producer reads its assigned slices (tagged with the global
        # task index as the sequence number) into its private channel.
        async def _producer(producer_id: int, ch_out: Channel) -> None:
            for task_idx, ir_slice in producer_tasks[producer_id]:
                await read_chunk(
                    context,
                    ir_slice,
                    task_idx,
                    ch_out,
                    ir_context,
                    estimated_chunk_bytes,
                )
            await ch_out.drain(context)

        # Run the lineariser and all producers concurrently.
        tasks = [lineariser.drain()]
        tasks.extend(
            _producer(i, ch_in) for i, ch_in in enumerate(lineariser.input_channels)
        )
        await asyncio.gather(*tasks)
248
+
249
+
250
@generate_ir_sub_network.register(DataFrameScan)
def _(
    ir: DataFrameScan, rec: SubNetGenerator
) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
    """Build the streaming sub-network for a DataFrameScan node."""
    state = rec.state
    options = state["config_options"]
    assert options.executor.name == "streaming", (
        "'in-memory' executor not supported in 'generate_ir_sub_network'"
    )
    executor = options.executor

    manager = ChannelManager(state["context"])
    channels: dict[IR, ChannelManager] = {ir: manager}

    node = dataframescan_node(
        state["context"],
        ir,
        state["ir_context"],
        manager.reserve_input_slot(),
        num_producers=state["max_io_threads"],
        rows_per_partition=executor.max_rows_per_partition,
        # target_partition_size doubles as the per-chunk size estimate.
        estimated_chunk_bytes=executor.target_partition_size,
    )
    nodes: dict[IR, list[Any]] = {ir: [node]}

    return nodes, channels
281
+
282
+
283
@lower_ir_node.register(Scan)
def _(
    ir: Scan, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """Lower a Scan node, attaching an IO partitioning plan."""
    config_options = rec.state["config_options"]

    # A scan only qualifies for multi-partition planning when it is a plain
    # read: supported format, no row limit, no skipped rows, no row index.
    plannable = (
        ir.typ in ("csv", "parquet", "ndjson")
        and ir.n_rows == -1
        and ir.skip_rows == 0
        and ir.row_index is None
    )
    if not plannable:
        plan = IOPartitionPlan(
            flavor=IOPartitionFlavor.SINGLE_READ, factor=len(ir.paths)
        )
        return ir, {ir: PartitionInfo(count=1, io_plan=plan)}

    # NOTE: We calculate the expected partition count
    # to help trigger fallback warnings in lower_ir_graph.
    # The generate_ir_sub_network logic is NOT required
    # to obey this partition count. However, the count
    # WILL match after an IO operation (for now).
    plan = scan_partition_plan(ir, rec.state["stats"], config_options)
    n_paths = len(list(ir.paths))
    if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
        # Each file is split into plan.factor pieces.
        count = plan.factor * n_paths
    else:
        # plan.factor files are fused into each partition.
        count = math.ceil(n_paths / plan.factor)

    return ir, {ir: PartitionInfo(count=count, io_plan=plan)}
312
+
313
+
314
async def read_chunk(
    context: Context,
    scan: IR,
    seq_num: int,
    ch_out: Channel[TableChunk],
    ir_context: IRExecutionContext,
    estimated_chunk_bytes: int,
) -> None:
    """
    Read a chunk from disk and send it to the output channel.

    Parameters
    ----------
    context
        The rapidsmpf context.
    scan
        The Scan or DataFrameScan node.
    seq_num
        The sequence number.
    ch_out
        The output channel.
    ir_context
        The execution context for the IR node.
    estimated_chunk_bytes
        Estimated size of the chunk in bytes. Used for memory reservation
        with block spilling to avoid thrashing.
    """
    # Reserve memory up front so block spilling does not thrash while the
    # evaluation runs in a worker thread.
    with opaque_reservation(context, estimated_chunk_bytes):
        frame = await asyncio.to_thread(
            scan.do_evaluate,
            *scan._non_child_args,
            context=ir_context,
        )
        chunk = TableChunk.from_pylibcudf_table(
            frame.table,
            frame.stream,
            exclusive_view=True,
        )
        await ch_out.send(context, Message(seq_num, chunk))
358
+
359
+
360
@define_py_node()
async def scan_node(
    context: Context,
    ir: Scan,
    ir_context: IRExecutionContext,
    ch_out: ChannelPair,
    *,
    num_producers: int,
    plan: IOPartitionPlan,
    parquet_options: ParquetOptions,
    estimated_chunk_bytes: int,
) -> None:
    """
    Scan node for rapidsmpf.

    Parameters
    ----------
    context
        The rapidsmpf context.
    ir
        The Scan node.
    ir_context
        The execution context for the IR node.
    ch_out
        The output ChannelPair.
    num_producers
        The number of producers to use for the scan node.
    plan
        The partitioning plan.
    parquet_options
        The Parquet options.
    estimated_chunk_bytes
        Estimated size of each chunk in bytes. Used for memory reservation
        with block spilling to avoid thrashing.
    """
    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
        # Build a list of local Scan operations
        scans: list[Scan | SplitScan] = []
        if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
            # Each file yields plan.factor SplitScan pieces. This rank owns a
            # contiguous range of those global pieces.
            count = plan.factor * len(ir.paths)
            local_count = math.ceil(count / context.comm().nranks)
            local_offset = local_count * context.comm().rank
            # Translate the global piece range back into a file range.
            path_offset = local_offset // plan.factor
            path_end = math.ceil((local_offset + local_count) / plan.factor)
            path_count = path_end - path_offset
            local_paths = ir.paths[path_offset : path_offset + path_count]
            # The first file may start mid-way through its splits.
            sindex = local_offset % plan.factor
            splits_created = 0
            for path in local_paths:
                base_scan = Scan(
                    ir.schema,
                    ir.typ,
                    ir.reader_options,
                    ir.cloud_options,
                    [path],
                    ir.with_columns,
                    ir.skip_rows,
                    ir.n_rows,
                    ir.row_index,
                    ir.include_file_paths,
                    ir.predicate,
                    parquet_options,
                )
                # Emit splits for this file until either the file is fully
                # split or this rank's quota is reached.
                while sindex < plan.factor and splits_created < local_count:
                    scans.append(
                        SplitScan(
                            ir.schema,
                            base_scan,
                            sindex,
                            plan.factor,
                            parquet_options,
                        )
                    )
                    sindex += 1
                    splits_created += 1
                # Subsequent files start from their first split.
                sindex = 0

        else:
            # FUSED_FILES / single-read flavors: group plan.factor paths
            # into each Scan.
            count = math.ceil(len(ir.paths) / plan.factor)
            local_count = math.ceil(count / context.comm().nranks)
            local_offset = local_count * context.comm().rank
            paths_offset_start = local_offset * plan.factor
            paths_offset_end = paths_offset_start + plan.factor * local_count
            for offset in range(paths_offset_start, paths_offset_end, plan.factor):
                local_paths = ir.paths[offset : offset + plan.factor]
                if len(local_paths) > 0:  # Only add scan if there are paths
                    scans.append(
                        Scan(
                            ir.schema,
                            ir.typ,
                            ir.reader_options,
                            ir.cloud_options,
                            local_paths,
                            ir.with_columns,
                            ir.skip_rows,
                            ir.n_rows,
                            ir.row_index,
                            ir.include_file_paths,
                            ir.predicate,
                            parquet_options,
                        )
                    )

        # Send basic metadata
        await ch_out.send_metadata(context, Metadata(max(1, len(scans))))

        # If there is nothing to scan, drain the channel and return
        if len(scans) == 0:
            await ch_out.data.drain(context)
            return

        # If there is only one scan or one producer, we can
        # skip the lineariser and read the chunks directly
        if len(scans) == 1 or num_producers == 1:
            for seq_num, scan in enumerate(scans):
                await read_chunk(
                    context,
                    scan,
                    seq_num,
                    ch_out.data,
                    ir_context,
                    estimated_chunk_bytes,
                )
            await ch_out.data.drain(context)
            return

        # Use Lineariser to ensure ordered delivery
        num_producers = min(num_producers, len(scans))
        lineariser = Lineariser(context, ch_out.data, num_producers)

        # Assign tasks to producers using round-robin
        producer_tasks: list[list[tuple[int, Scan | SplitScan]]] = [
            [] for _ in range(num_producers)
        ]
        for task_idx, scan in enumerate(scans):
            producer_id = task_idx % num_producers
            producer_tasks[producer_id].append((task_idx, scan))

        # Each producer reads its assigned scans (tagged with the global
        # task index as the sequence number) into its private channel.
        async def _producer(producer_id: int, ch_out: Channel) -> None:
            for task_idx, scan in producer_tasks[producer_id]:
                await read_chunk(
                    context,
                    scan,
                    task_idx,
                    ch_out,
                    ir_context,
                    estimated_chunk_bytes,
                )
            await ch_out.drain(context)

        # Run the lineariser and all producers concurrently.
        tasks = [lineariser.drain()]
        tasks.extend(
            _producer(i, ch_in) for i, ch_in in enumerate(lineariser.input_channels)
        )
        await asyncio.gather(*tasks)
515
+
516
+
517
def make_rapidsmpf_read_parquet_node(
    context: Context,
    ir: Scan,
    num_producers: int,
    ch_out: ChannelPair,
    stats: StatsCollector,
    partition_info: PartitionInfo,
) -> Any | None:
    """
    Make a RapidsMPF read parquet node.

    Parameters
    ----------
    context
        The rapidsmpf context.
    ir
        The Scan node.
    num_producers
        The number of producers to use for the scan node.
    ch_out
        The output ChannelPair.
    stats
        The statistics collector.
    partition_info
        The partition information.

    Returns
    -------
    The RapidsMPF read parquet node, or None if the predicate cannot be
    converted to a parquet filter (caller should fall back to scan_node).
    """
    # Deferred import: only needed when the native parquet path is taken.
    from rapidsmpf.streaming.cudf.parquet import Filter, read_parquet

    # Build ParquetReaderOptions
    try:
        stream = context.get_stream_from_pool()
        parquet_reader_options = plc.io.parquet.ParquetReaderOptions.builder(
            plc.io.SourceInfo(ir.paths)
        ).build()

        if ir.with_columns is not None:
            parquet_reader_options.set_columns(ir.with_columns)

        # Build predicate filter if present (passed separately to read_parquet)
        filter_obj = None
        if ir.predicate is not None:
            # Literals must first be cast to the files' physical types so the
            # resulting filter expression matches the on-disk data.
            filter_expr = to_parquet_filter(
                _cast_literals_to_physical_types(
                    ir.predicate.value,
                    _parquet_physical_types(
                        ir.schema,
                        ir.paths,
                        ir.with_columns or list(ir.schema.keys()),
                        stream,
                    ),
                ),
                stream=stream,
            )
            if filter_expr is None:
                # Predicate cannot be converted to parquet filter
                # Return None to signal fallback to scan_node
                return None
            filter_obj = Filter(stream, filter_expr)
    except Exception as e:
        # Deliberately broad: any failure while building reader options is
        # surfaced as a ValueError with the original exception chained.
        raise ValueError(f"Failed to build ParquetReaderOptions: {e}") from e

    # Calculate num_rows_per_chunk from statistics
    # Default to a reasonable chunk size if statistics are unavailable
    estimated_row_count: ColumnStat[int] | None = stats.row_count.get(ir)
    if estimated_row_count is None:
        # Fall back to any per-column source row count.
        for cs in stats.column_stats.get(ir, {}).values():
            if cs.source_info.row_count.value is not None:
                estimated_row_count = cs.source_info.row_count
                break
    if estimated_row_count is not None and estimated_row_count.value is not None:
        num_rows_per_chunk = int(
            max(1, estimated_row_count.value // partition_info.count)
        )
    else:
        # Fallback: use a default chunk size if statistics are not available
        num_rows_per_chunk = 1_000_000  # 1 million rows as default

    # Validate inputs
    if num_rows_per_chunk <= 0:
        raise ValueError(f"Invalid num_rows_per_chunk: {num_rows_per_chunk}")
    if num_producers <= 0:
        raise ValueError(f"Invalid num_producers: {num_producers}")

    try:
        return read_parquet(
            context,
            ch_out.data,
            num_producers,
            parquet_reader_options,
            num_rows_per_chunk,
            filter=filter_obj,
        )
    except Exception as e:
        # Wrap with full construction parameters to make failures debuggable.
        raise RuntimeError(
            f"Failed to create read_parquet node: {e}\n"
            f" paths: {ir.paths}\n"
            f" num_producers: {num_producers}\n"
            f" num_rows_per_chunk: {num_rows_per_chunk}\n"
            f" partition_count: {partition_info.count}\n"
            f" filter: {filter_obj}"
        ) from e
623
+
624
+
625
@generate_ir_sub_network.register(Scan)
def _(
    ir: Scan, rec: SubNetGenerator
) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
    """Build the streaming sub-network for a Scan node.

    Prefers the rapidsmpf native ``read_parquet`` node when the scan
    qualifies; otherwise falls back to the generic ``scan_node``.
    """
    config_options = rec.state["config_options"]
    executor = rec.state["config_options"].executor
    assert executor.name == "streaming", (
        "'in-memory' executor not supported in 'generate_ir_sub_network'"
    )
    parquet_options = config_options.parquet_options
    partition_info = rec.state["partition_info"][ir]
    num_producers = rec.state["max_io_threads"]
    channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}

    assert partition_info.io_plan is not None, "Scan node must have a partition plan"
    plan: IOPartitionPlan = partition_info.io_plan

    # Native node cannot split large files in distributed mode yet
    distributed_split_files = (
        plan.flavor == IOPartitionFlavor.SPLIT_FILES
        and rec.state["context"].comm().nranks > 1
    )

    # Use rapidsmpf native read_parquet node if possible
    ch_pair = channels[ir].reserve_input_slot()
    nodes: dict[IR, list[Any]] = {}
    native_node: Any = None
    # Eligibility: native parquet reads only, and only for plain scans
    # (no row index, no file-path column, no row limit / skip).
    if (
        parquet_options.use_rapidsmpf_native
        and partition_info.count > 1
        and ir.typ == "parquet"
        and ir.row_index is None
        and ir.include_file_paths is None
        and ir.n_rows == -1
        and ir.skip_rows == 0
        and not distributed_split_files
    ):
        # May still return None if the predicate cannot be converted
        # to a parquet filter.
        native_node = make_rapidsmpf_read_parquet_node(
            rec.state["context"],
            ir,
            num_producers,
            ch_pair,
            rec.state["stats"],
            partition_info,
        )

    if native_node is not None:
        # Need metadata node, because the native read_parquet
        # node does not send metadata.
        metadata_node = metadata_feeder_node(
            rec.state["context"],
            ch_pair,
            Metadata(partition_info.count),
        )
        nodes[ir] = [metadata_node, native_node]
    else:
        # Fall back to scan_node (predicate not convertible, or other constraint)
        parquet_options = dataclasses.replace(parquet_options, chunked=False)

        nodes[ir] = [
            scan_node(
                rec.state["context"],
                ir,
                rec.state["ir_context"],
                ch_pair,
                num_producers=num_producers,
                plan=plan,
                parquet_options=parquet_options,
                estimated_chunk_bytes=executor.target_partition_size,
            )
        ]
    return nodes, channels