cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +60 -15
- cudf_polars/containers/column.py +137 -77
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +256 -114
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +33 -3
- cudf_polars/dsl/expressions/unary.py +126 -64
- cudf_polars/dsl/ir.py +1053 -350
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +307 -107
- cudf_polars/dsl/utils/aggregations.py +43 -30
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +55 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +792 -2
- cudf_polars/experimental/benchmarks/utils.py +596 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +30 -15
- cudf_polars/experimental/groupby.py +25 -4
- cudf_polars/experimental/io.py +156 -124
- cudf_polars/experimental/join.py +53 -23
- cudf_polars/experimental/parallel.py +68 -19
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
- cudf_polars/experimental/rapidsmpf/core.py +488 -0
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
- cudf_polars/experimental/rapidsmpf/io.py +696 -0
- cudf_polars/experimental/rapidsmpf/join.py +322 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
- cudf_polars/experimental/rapidsmpf/union.py +115 -0
- cudf_polars/experimental/rapidsmpf/utils.py +374 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +46 -12
- cudf_polars/experimental/sort.py +100 -26
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +93 -17
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +473 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +5 -4
- cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
- cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,696 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""IO logic for the RapidsMPF streaming runtime."""
+
+from __future__ import annotations
+
+import asyncio
+import dataclasses
+import math
+from typing import TYPE_CHECKING, Any
+
+from rapidsmpf.streaming.core.message import Message
+from rapidsmpf.streaming.cudf.table_chunk import TableChunk
+
+import pylibcudf as plc
+
+from cudf_polars.dsl.ir import (
+    IR,
+    DataFrameScan,
+    Scan,
+    _cast_literals_to_physical_types,
+    _parquet_physical_types,
+)
+from cudf_polars.dsl.to_ast import to_parquet_filter
+from cudf_polars.experimental.base import (
+    IOPartitionFlavor,
+    IOPartitionPlan,
+    PartitionInfo,
+)
+from cudf_polars.experimental.io import SplitScan, scan_partition_plan
+from cudf_polars.experimental.rapidsmpf.dispatch import (
+    generate_ir_sub_network,
+    lower_ir_node,
+)
+from cudf_polars.experimental.rapidsmpf.nodes import (
+    define_py_node,
+    metadata_feeder_node,
+    shutdown_on_error,
+)
+from cudf_polars.experimental.rapidsmpf.utils import (
+    ChannelManager,
+    Metadata,
+    opaque_reservation,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from rapidsmpf.streaming.core.channel import Channel
+    from rapidsmpf.streaming.core.context import Context
+
+    from cudf_polars.dsl.ir import IR, IRExecutionContext
+    from cudf_polars.experimental.base import ColumnStat, StatsCollector
+    from cudf_polars.experimental.rapidsmpf.core import SubNetGenerator
+    from cudf_polars.experimental.rapidsmpf.dispatch import LowerIRTransformer
+    from cudf_polars.experimental.rapidsmpf.utils import ChannelPair
+    from cudf_polars.utils.config import ParquetOptions
+
+
+class Lineariser:
+    """
+    Linearizer that ensures ordered delivery from multiple concurrent producers.
+
+    Creates one input channel per producer and streams messages to output
+    in sequence-number order, buffering only out-of-order arrivals.
+    """
+
+    def __init__(
+        self, context: Context, ch_out: Channel[TableChunk], num_producers: int
+    ):
+        self.context = context
+        self.ch_out = ch_out
+        self.num_producers = num_producers
+        self.input_channels = [context.create_channel() for _ in range(num_producers)]
+
+    async def drain(self) -> None:
+        """
+        Drain producer channels and forward messages in sequence-number order.
+
+        Streams messages to output as soon as they arrive in order, buffering
+        only out-of-order messages to minimize memory pressure.
+        """
+        next_seq = 0
+        buffer = {}
+
+        pending_tasks = {
+            asyncio.create_task(ch.recv(self.context)): ch for ch in self.input_channels
+        }
+
+        while pending_tasks:
+            done, _ = await asyncio.wait(
+                pending_tasks.keys(), return_when=asyncio.FIRST_COMPLETED
+            )
+
+            for task in done:
+                ch = pending_tasks.pop(task)
+                msg = await task
+
+                if msg is not None:
+                    buffer[msg.sequence_number] = msg
+                    new_task = asyncio.create_task(ch.recv(self.context))
+                    pending_tasks[new_task] = ch
+
+            # Forward consecutive messages
+            while next_seq in buffer:
+                await self.ch_out.send(self.context, buffer.pop(next_seq))
+                next_seq += 1
+
+        # Forward any remaining buffered messages
+        for seq in sorted(buffer.keys()):
+            await self.ch_out.send(self.context, buffer.pop(seq))
+
+        await self.ch_out.drain(self.context)
+
+
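The Lineariser above is the ordering backbone for every multi-producer read in this module. Its reordering protocol can be exercised standalone; the following is a minimal sketch, using asyncio.Queue and a None sentinel as toy stand-ins for rapidsmpf channels and channel shutdown (illustrative only, not the real Channel/Context API):

    # Toy model: 3 producers emit globally sequence-numbered messages into
    # their own queues; the consumer merges them back into 0,1,2,... order,
    # buffering only the out-of-order tail.
    import asyncio
    import random

    async def produce(q: asyncio.Queue, seqs: list[int]) -> None:
        for s in seqs:
            await asyncio.sleep(random.random() / 100)  # arrivals interleave
            await q.put(s)
        await q.put(None)  # sentinel: this producer is drained

    async def linearise(queues: list[asyncio.Queue]) -> list[int]:
        out: list[int] = []
        buffer: dict[int, int] = {}
        next_seq = 0
        pending = {asyncio.create_task(q.get()): q for q in queues}
        while pending:
            done, _ = await asyncio.wait(
                pending.keys(), return_when=asyncio.FIRST_COMPLETED
            )
            for task in done:
                q = pending.pop(task)
                seq = task.result()
                if seq is not None:
                    buffer[seq] = seq  # stands in for a message keyed by seq
                    pending[asyncio.create_task(q.get())] = q
            while next_seq in buffer:  # forward the consecutive prefix
                out.append(buffer.pop(next_seq))
                next_seq += 1
        return out

    async def main() -> None:
        queues = [asyncio.Queue() for _ in range(3)]
        producers = [produce(q, list(range(i, 9, 3))) for i, q in enumerate(queues)]
        results = await asyncio.gather(linearise(queues), *producers)
        assert results[0] == list(range(9))  # delivery order is restored

    asyncio.run(main())

As in the real class, at most one receive is outstanding per producer, so memory is bounded by the number of producers plus the out-of-order backlog.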
+@lower_ir_node.register(DataFrameScan)
+def _(
+    ir: DataFrameScan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    config_options = rec.state["config_options"]
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'lower_ir_node_rapidsmpf'"
+    )
+
+    # NOTE: We calculate the expected partition count
+    # to help trigger fallback warnings in lower_ir_graph.
+    # The generate_ir_sub_network logic is NOT required
+    # to obey this partition count. However, the count
+    # WILL match after an IO operation (for now).
+    rows_per_partition = config_options.executor.max_rows_per_partition
+    nrows = max(ir.df.shape()[0], 1)
+    count = math.ceil(nrows / rows_per_partition)
+
+    return ir, {ir: PartitionInfo(count=count)}
+
+
+@define_py_node()
+async def dataframescan_node(
+    context: Context,
+    ir: DataFrameScan,
+    ir_context: IRExecutionContext,
+    ch_out: ChannelPair,
+    *,
+    num_producers: int,
+    rows_per_partition: int,
+    estimated_chunk_bytes: int,
+) -> None:
+    """
+    DataFrameScan node for rapidsmpf.
+
+    Parameters
+    ----------
+    context
+        The rapidsmpf context.
+    ir
+        The DataFrameScan node.
+    ir_context
+        The execution context for the IR node.
+    ch_out
+        The output ChannelPair.
+    num_producers
+        The number of producers to use for the DataFrameScan node.
+    rows_per_partition
+        The number of rows per partition.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
+    """
+    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
+        # Find local partition count.
+        nrows = ir.df.shape()[0]
+        global_count = math.ceil(nrows / rows_per_partition) if nrows > 0 else 0
+
+        # For single rank, simplify the logic
+        if context.comm().nranks == 1:
+            local_count = global_count
+            local_offset = 0
+        else:
+            local_count = math.ceil(global_count / context.comm().nranks)
+            local_offset = local_count * context.comm().rank
+
+        # Send basic metadata
+        await ch_out.send_metadata(context, Metadata(max(1, local_count)))
+
+        # Build list of IR slices to read
+        ir_slices = []
+        for seq_num in range(local_count):
+            offset = local_offset * rows_per_partition + seq_num * rows_per_partition
+            if offset >= nrows:
+                break
+            ir_slices.append(
+                DataFrameScan(
+                    ir.schema,
+                    ir.df.slice(offset, rows_per_partition),
+                    ir.projection,
+                )
+            )
+
+        # If there are no slices, drain the channel and return
+        if len(ir_slices) == 0:
+            await ch_out.data.drain(context)
+            return
+
+        # If there is only one ir_slices or one producer, we can
+        # skip the lineariser and read the chunks directly
+        if len(ir_slices) == 1 or num_producers == 1:
+            for seq_num, ir_slice in enumerate(ir_slices):
+                await read_chunk(
+                    context,
+                    ir_slice,
+                    seq_num,
+                    ch_out.data,
+                    ir_context,
+                    estimated_chunk_bytes,
+                )
+            await ch_out.data.drain(context)
+            return
+
+        # Use Lineariser to ensure ordered delivery
+        num_producers = min(num_producers, len(ir_slices))
+        lineariser = Lineariser(context, ch_out.data, num_producers)
+
+        # Assign tasks to producers using round-robin
+        producer_tasks: list[list[tuple[int, DataFrameScan]]] = [
+            [] for _ in range(num_producers)
+        ]
+        for task_idx, ir_slice in enumerate(ir_slices):
+            producer_id = task_idx % num_producers
+            producer_tasks[producer_id].append((task_idx, ir_slice))
+
+        async def _producer(producer_id: int, ch_out: Channel) -> None:
+            for task_idx, ir_slice in producer_tasks[producer_id]:
+                await read_chunk(
+                    context,
+                    ir_slice,
+                    task_idx,
+                    ch_out,
+                    ir_context,
+                    estimated_chunk_bytes,
+                )
+            await ch_out.drain(context)
+
+        tasks = [lineariser.drain()]
+        tasks.extend(
+            _producer(i, ch_in) for i, ch_in in enumerate(lineariser.input_channels)
+        )
+        await asyncio.gather(*tasks)
+
+
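The rank/slice arithmetic in dataframescan_node is easy to sanity-check on paper. A small pure-Python trace of the same formulas, with illustrative values (no GPU or rapidsmpf required):

    import math

    nrows, rows_per_partition, nranks = 10, 3, 2
    global_count = math.ceil(nrows / rows_per_partition)   # 4 partitions overall
    for rank in range(nranks):
        local_count = math.ceil(global_count / nranks)     # 2 partitions per rank
        local_offset = local_count * rank
        for seq_num in range(local_count):
            offset = local_offset * rows_per_partition + seq_num * rows_per_partition
            if offset >= nrows:
                break
            print(f"rank {rank}, seq {seq_num}: df.slice({offset}, {rows_per_partition})")
    # rank 0, seq 0: df.slice(0, 3)
    # rank 0, seq 1: df.slice(3, 3)
    # rank 1, seq 0: df.slice(6, 3)
    # rank 1, seq 1: df.slice(9, 3)  <- slice length is clamped to the 1 remaining row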
+@generate_ir_sub_network.register(DataFrameScan)
+def _(
+    ir: DataFrameScan, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
+    config_options = rec.state["config_options"]
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'generate_ir_sub_network'"
+    )
+    rows_per_partition = config_options.executor.max_rows_per_partition
+    num_producers = rec.state["max_io_threads"]
+    # Use target_partition_size as the estimated chunk size
+    estimated_chunk_bytes = config_options.executor.target_partition_size
+
+    context = rec.state["context"]
+    ir_context = rec.state["ir_context"]
+    channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
+    nodes: dict[IR, list[Any]] = {
+        ir: [
+            dataframescan_node(
+                context,
+                ir,
+                ir_context,
+                channels[ir].reserve_input_slot(),
+                num_producers=num_producers,
+                rows_per_partition=rows_per_partition,
+                estimated_chunk_bytes=estimated_chunk_bytes,
+            )
+        ]
+    }
+
+    return nodes, channels
+
+
+@lower_ir_node.register(Scan)
+def _(
+    ir: Scan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    config_options = rec.state["config_options"]
+    if (
+        ir.typ in ("csv", "parquet", "ndjson")
+        and ir.n_rows == -1
+        and ir.skip_rows == 0
+        and ir.row_index is None
+    ):
+        # NOTE: We calculate the expected partition count
+        # to help trigger fallback warnings in lower_ir_graph.
+        # The generate_ir_sub_network logic is NOT required
+        # to obey this partition count. However, the count
+        # WILL match after an IO operation (for now).
+        plan = scan_partition_plan(ir, rec.state["stats"], config_options)
+        paths = list(ir.paths)
+        if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
+            count = plan.factor * len(paths)
+        else:
+            count = math.ceil(len(paths) / plan.factor)
+
+        return ir, {ir: PartitionInfo(count=count, io_plan=plan)}
+    else:
+        plan = IOPartitionPlan(
+            flavor=IOPartitionFlavor.SINGLE_READ, factor=len(ir.paths)
+        )
+        return ir, {ir: PartitionInfo(count=1, io_plan=plan)}
+
+
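Note that plan.factor changes meaning with the flavor: under SPLIT_FILES it is the number of pieces per file, otherwise it is the number of whole files fused into one partition. A quick numeric check of the two count formulas above (factor values are illustrative):

    import math

    n_paths = 10
    split_factor = 3  # SPLIT_FILES: pieces per file
    fuse_factor = 3   # otherwise: whole files per partition

    print(split_factor * n_paths)            # 30 partitions, one third of a file each
    print(math.ceil(n_paths / fuse_factor))  # 4 partitions, up to 3 files each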
+async def read_chunk(
+    context: Context,
+    scan: IR,
+    seq_num: int,
+    ch_out: Channel[TableChunk],
+    ir_context: IRExecutionContext,
+    estimated_chunk_bytes: int,
+) -> None:
+    """
+    Read a chunk from disk and send it to the output channel.
+
+    Parameters
+    ----------
+    context
+        The rapidsmpf context.
+    scan
+        The Scan or DataFrameScan node.
+    seq_num
+        The sequence number.
+    ch_out
+        The output channel.
+    ir_context
+        The execution context for the IR node.
+    estimated_chunk_bytes
+        Estimated size of the chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
+    """
+    with opaque_reservation(context, estimated_chunk_bytes):
+        df = await asyncio.to_thread(
+            scan.do_evaluate,
+            *scan._non_child_args,
+            context=ir_context,
+        )
+        await ch_out.send(
+            context,
+            Message(
+                seq_num,
+                TableChunk.from_pylibcudf_table(
+                    df.table,
+                    df.stream,
+                    exclusive_view=True,
+                ),
+            ),
+        )
+
+
+@define_py_node()
+async def scan_node(
+    context: Context,
+    ir: Scan,
+    ir_context: IRExecutionContext,
+    ch_out: ChannelPair,
+    *,
+    num_producers: int,
+    plan: IOPartitionPlan,
+    parquet_options: ParquetOptions,
+    estimated_chunk_bytes: int,
+) -> None:
+    """
+    Scan node for rapidsmpf.
+
+    Parameters
+    ----------
+    context
+        The rapidsmpf context.
+    ir
+        The Scan node.
+    ir_context
+        The execution context for the IR node.
+    ch_out
+        The output ChannelPair.
+    num_producers
+        The number of producers to use for the scan node.
+    plan
+        The partitioning plan.
+    parquet_options
+        The Parquet options.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
+    """
+    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
+        # Build a list of local Scan operations
+        scans: list[Scan | SplitScan] = []
+        if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
+            count = plan.factor * len(ir.paths)
+            local_count = math.ceil(count / context.comm().nranks)
+            local_offset = local_count * context.comm().rank
+            path_offset = local_offset // plan.factor
+            path_end = math.ceil((local_offset + local_count) / plan.factor)
+            path_count = path_end - path_offset
+            local_paths = ir.paths[path_offset : path_offset + path_count]
+            sindex = local_offset % plan.factor
+            splits_created = 0
+            for path in local_paths:
+                base_scan = Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    [path],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.include_file_paths,
+                    ir.predicate,
+                    parquet_options,
+                )
+                while sindex < plan.factor and splits_created < local_count:
+                    scans.append(
+                        SplitScan(
+                            ir.schema,
+                            base_scan,
+                            sindex,
+                            plan.factor,
+                            parquet_options,
+                        )
+                    )
+                    sindex += 1
+                    splits_created += 1
+                sindex = 0
+
+        else:
+            count = math.ceil(len(ir.paths) / plan.factor)
+            local_count = math.ceil(count / context.comm().nranks)
+            local_offset = local_count * context.comm().rank
+            paths_offset_start = local_offset * plan.factor
+            paths_offset_end = paths_offset_start + plan.factor * local_count
+            for offset in range(paths_offset_start, paths_offset_end, plan.factor):
+                local_paths = ir.paths[offset : offset + plan.factor]
+                if len(local_paths) > 0:  # Only add scan if there are paths
+                    scans.append(
+                        Scan(
+                            ir.schema,
+                            ir.typ,
+                            ir.reader_options,
+                            ir.cloud_options,
+                            local_paths,
+                            ir.with_columns,
+                            ir.skip_rows,
+                            ir.n_rows,
+                            ir.row_index,
+                            ir.include_file_paths,
+                            ir.predicate,
+                            parquet_options,
+                        )
+                    )
+
+        # Send basic metadata
+        await ch_out.send_metadata(context, Metadata(max(1, len(scans))))
+
+        # If there is nothing to scan, drain the channel and return
+        if len(scans) == 0:
+            await ch_out.data.drain(context)
+            return
+
+        # If there is only one scan or one producer, we can
+        # skip the lineariser and read the chunks directly
+        if len(scans) == 1 or num_producers == 1:
+            for seq_num, scan in enumerate(scans):
+                await read_chunk(
+                    context,
+                    scan,
+                    seq_num,
+                    ch_out.data,
+                    ir_context,
+                    estimated_chunk_bytes,
+                )
+            await ch_out.data.drain(context)
+            return
+
+        # Use Lineariser to ensure ordered delivery
+        num_producers = min(num_producers, len(scans))
+        lineariser = Lineariser(context, ch_out.data, num_producers)
+
+        # Assign tasks to producers using round-robin
+        producer_tasks: list[list[tuple[int, Scan | SplitScan]]] = [
+            [] for _ in range(num_producers)
+        ]
+        for task_idx, scan in enumerate(scans):
+            producer_id = task_idx % num_producers
+            producer_tasks[producer_id].append((task_idx, scan))
+
+        async def _producer(producer_id: int, ch_out: Channel) -> None:
+            for task_idx, scan in producer_tasks[producer_id]:
+                await read_chunk(
+                    context,
+                    scan,
+                    task_idx,
+                    ch_out,
+                    ir_context,
+                    estimated_chunk_bytes,
+                )
+            await ch_out.drain(context)
+
+        tasks = [lineariser.drain()]
+        tasks.extend(
+            _producer(i, ch_in) for i, ch_in in enumerate(lineariser.input_channels)
+        )
+        await asyncio.gather(*tasks)
+
+
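The SPLIT_FILES branch of scan_node maps each rank onto a contiguous run of global (file, split-index) pairs. A plain-Python replay of that index arithmetic, with 3 files, factor=2, and 2 ranks (so rank 1 picks up global splits 3-5):

    import math

    paths = ["a.pq", "b.pq", "c.pq"]
    factor, nranks = 2, 2
    for rank in range(nranks):
        count = factor * len(paths)              # 6 global splits
        local_count = math.ceil(count / nranks)  # 3 splits per rank
        local_offset = local_count * rank        # rank 1 starts at split 3
        path_offset = local_offset // factor     # first file this rank touches
        path_end = math.ceil((local_offset + local_count) / factor)
        sindex = local_offset % factor           # split index within the first file
        splits, created = [], 0
        for path in paths[path_offset:path_end]:
            while sindex < factor and created < local_count:
                splits.append((path, sindex))
                sindex += 1
                created += 1
            sindex = 0
        print(rank, splits)
    # 0 [('a.pq', 0), ('a.pq', 1), ('b.pq', 0)]
    # 1 [('b.pq', 1), ('c.pq', 0), ('c.pq', 1)]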
+def make_rapidsmpf_read_parquet_node(
+    context: Context,
+    ir: Scan,
+    num_producers: int,
+    ch_out: ChannelPair,
+    stats: StatsCollector,
+    partition_info: PartitionInfo,
+) -> Any | None:
+    """
+    Make a RapidsMPF read parquet node.
+
+    Parameters
+    ----------
+    context
+        The rapidsmpf context.
+    ir
+        The Scan node.
+    num_producers
+        The number of producers to use for the scan node.
+    ch_out
+        The output ChannelPair.
+    stats
+        The statistics collector.
+    partition_info
+        The partition information.
+
+    Returns
+    -------
+    The RapidsMPF read parquet node, or None if the predicate cannot be
+    converted to a parquet filter (caller should fall back to scan_node).
+    """
+    from rapidsmpf.streaming.cudf.parquet import Filter, read_parquet
+
+    # Build ParquetReaderOptions
+    try:
+        stream = context.get_stream_from_pool()
+        parquet_reader_options = plc.io.parquet.ParquetReaderOptions.builder(
+            plc.io.SourceInfo(ir.paths)
+        ).build()
+
+        if ir.with_columns is not None:
+            parquet_reader_options.set_columns(ir.with_columns)
+
+        # Build predicate filter if present (passed separately to read_parquet)
+        filter_obj = None
+        if ir.predicate is not None:
+            filter_expr = to_parquet_filter(
+                _cast_literals_to_physical_types(
+                    ir.predicate.value,
+                    _parquet_physical_types(
+                        ir.schema,
+                        ir.paths,
+                        ir.with_columns or list(ir.schema.keys()),
+                        stream,
+                    ),
+                ),
+                stream=stream,
+            )
+            if filter_expr is None:
+                # Predicate cannot be converted to parquet filter
+                # Return None to signal fallback to scan_node
+                return None
+            filter_obj = Filter(stream, filter_expr)
+    except Exception as e:
+        raise ValueError(f"Failed to build ParquetReaderOptions: {e}") from e
+
+    # Calculate num_rows_per_chunk from statistics
+    # Default to a reasonable chunk size if statistics are unavailable
+    estimated_row_count: ColumnStat[int] | None = stats.row_count.get(ir)
+    if estimated_row_count is None:
+        for cs in stats.column_stats.get(ir, {}).values():
+            if cs.source_info.row_count.value is not None:
+                estimated_row_count = cs.source_info.row_count
+                break
+    if estimated_row_count is not None and estimated_row_count.value is not None:
+        num_rows_per_chunk = int(
+            max(1, estimated_row_count.value // partition_info.count)
+        )
+    else:
+        # Fallback: use a default chunk size if statistics are not available
+        num_rows_per_chunk = 1_000_000  # 1 million rows as default
+
+    # Validate inputs
+    if num_rows_per_chunk <= 0:
+        raise ValueError(f"Invalid num_rows_per_chunk: {num_rows_per_chunk}")
+    if num_producers <= 0:
+        raise ValueError(f"Invalid num_producers: {num_producers}")
+
+    try:
+        return read_parquet(
+            context,
+            ch_out.data,
+            num_producers,
+            parquet_reader_options,
+            num_rows_per_chunk,
+            filter=filter_obj,
+        )
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to create read_parquet node: {e}\n"
+            f" paths: {ir.paths}\n"
+            f" num_producers: {num_producers}\n"
+            f" num_rows_per_chunk: {num_rows_per_chunk}\n"
+            f" partition_count: {partition_info.count}\n"
+            f" filter: {filter_obj}"
+        ) from e
+
+
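Stripped of the statistics plumbing, the chunk sizing above reduces to the estimated total row count divided across the planned partition count, with a one-million-row default when no estimate survived planning. A condensed restatement:

    def rows_per_chunk(estimated_rows: int | None, partition_count: int) -> int:
        # Mirrors the logic above; names are illustrative, not the module's API.
        if estimated_rows is not None:
            return max(1, estimated_rows // partition_count)
        return 1_000_000  # default when statistics are unavailable

    assert rows_per_chunk(10_000_000, 8) == 1_250_000
    assert rows_per_chunk(None, 8) == 1_000_000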
+@generate_ir_sub_network.register(Scan)
+def _(
+    ir: Scan, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
+    config_options = rec.state["config_options"]
+    executor = rec.state["config_options"].executor
+    assert executor.name == "streaming", (
+        "'in-memory' executor not supported in 'generate_ir_sub_network'"
+    )
+    parquet_options = config_options.parquet_options
+    partition_info = rec.state["partition_info"][ir]
+    num_producers = rec.state["max_io_threads"]
+    channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
+
+    assert partition_info.io_plan is not None, "Scan node must have a partition plan"
+    plan: IOPartitionPlan = partition_info.io_plan
+
+    # Native node cannot split large files in distributed mode yet
+    distributed_split_files = (
+        plan.flavor == IOPartitionFlavor.SPLIT_FILES
+        and rec.state["context"].comm().nranks > 1
+    )
+
+    # Use rapidsmpf native read_parquet node if possible
+    ch_pair = channels[ir].reserve_input_slot()
+    nodes: dict[IR, list[Any]] = {}
+    native_node: Any = None
+    if (
+        parquet_options.use_rapidsmpf_native
+        and partition_info.count > 1
+        and ir.typ == "parquet"
+        and ir.row_index is None
+        and ir.include_file_paths is None
+        and ir.n_rows == -1
+        and ir.skip_rows == 0
+        and not distributed_split_files
+    ):
+        native_node = make_rapidsmpf_read_parquet_node(
+            rec.state["context"],
+            ir,
+            num_producers,
+            ch_pair,
+            rec.state["stats"],
+            partition_info,
+        )
+
+    if native_node is not None:
+        # Need metadata node, because the native read_parquet
+        # node does not send metadata.
+        metadata_node = metadata_feeder_node(
+            rec.state["context"],
+            ch_pair,
+            Metadata(partition_info.count),
+        )
+        nodes[ir] = [metadata_node, native_node]
+    else:
+        # Fall back to scan_node (predicate not convertible, or other constraint)
+        parquet_options = dataclasses.replace(parquet_options, chunked=False)
+
+        nodes[ir] = [
+            scan_node(
+                rec.state["context"],
+                ir,
+                rec.state["ir_context"],
+                ch_pair,
+                num_producers=num_producers,
+                plan=plan,
+                parquet_options=parquet_options,
+                estimated_chunk_bytes=executor.target_partition_size,
+            )
+        ]
+    return nodes, channels
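For orientation, this module is only reached through the cudf-polars streaming executor. A hedged usage sketch follows: GPUEngine and executor="streaming" are established cudf-polars surface area, but the executor_options key below is inferred from the config fields referenced in this diff (max_rows_per_partition, target_partition_size) and may not match the released 26.2.0 API exactly.

    import polars as pl

    q = (
        pl.scan_parquet("data/*.parquet")
        .filter(pl.col("x") > 0)
        .group_by("k")
        .agg(pl.col("x").sum())
    )
    # Option name inferred from this diff, not verified against 26.2.0.
    engine = pl.GPUEngine(
        executor="streaming",
        executor_options={"max_rows_per_partition": 1_000_000},
    )
    result = q.collect(engine=engine)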