cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/spilling.py ADDED

```diff
@@ -0,0 +1,151 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Spilling in multi-partition Dask execution using RAPIDSMPF."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from dask.sizeof import sizeof
+from distributed import get_worker
+from rapidsmpf.buffer.buffer import MemoryType
+from rapidsmpf.integrations.dask.core import get_worker_context
+from rapidsmpf.integrations.dask.spilling import SpillableWrapper
+
+from cudf_polars.containers import DataFrame
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, MutableMapping
+    from typing import Any
+
+    from cudf_polars.utils.config import ConfigOptions
+
+
+def wrap_arg(obj: Any) -> Any:
+    """
+    Make `obj` spillable if it is a DataFrame.
+
+    Parameters
+    ----------
+    obj
+        The object to be wrapped (if it is a DataFrame).
+
+    Returns
+    -------
+    A SpillableWrapper if obj is a DataFrame, otherwise the original object.
+    """
+    if isinstance(obj, DataFrame):
+        return SpillableWrapper(on_device=obj)
+    return obj
+
+
+def unwrap_arg(obj: Any) -> Any:
+    """
+    Unwrap a SpillableWrapper to retrieve the original object.
+
+    Parameters
+    ----------
+    obj
+        The object to be unwrapped.
+
+    Returns
+    -------
+    The unwrapped object if obj is a SpillableWrapper, otherwise the original object.
+    """
+    if isinstance(obj, SpillableWrapper):
+        return obj.unspill()
+    return obj
+
+
+def wrap_func_spillable(
+    func: Callable,
+    *,
+    make_func_output_spillable: bool,
+    target_partition_size: int,
+) -> Callable:
+    """
+    Wrap a function to handle spillable DataFrames.
+
+    Parameters
+    ----------
+    func
+        The function to be wrapped.
+    make_func_output_spillable
+        Whether to wrap the function's output in a SpillableWrapper.
+    target_partition_size
+        Target byte size for IO tasks.
+
+    Returns
+    -------
+    A wrapped function that processes spillable DataFrames.
+    """
+
+    def wrapper(*args: Any) -> Any:
+        # Make headroom before executing the task
+        headroom = 0
+        probable_io_task = True
+        for arg in args:
+            if isinstance(arg, SpillableWrapper):
+                if arg.mem_type() == MemoryType.HOST:
+                    headroom += sizeof(arg._on_host)
+                probable_io_task = False
+        if probable_io_task:
+            # Likely an IO task - assume we need target_partition_size
+            headroom = target_partition_size
+        if headroom > 128_000_000:  # Don't waste time on smaller data
+            ctx = get_worker_context(get_worker())
+            with ctx.lock:
+                ctx.br.spill_manager.spill_to_make_headroom(headroom=headroom)
+
+        ret: Any = func(*(unwrap_arg(arg) for arg in args))
+        if make_func_output_spillable:
+            ret = wrap_arg(ret)
+        return ret
+
+    return wrapper
+
+
+def wrap_dataframe_in_spillable(
+    graph: MutableMapping[Any, Any],
+    ignore_key: str | tuple[str, int],
+    config_options: ConfigOptions,
+) -> MutableMapping[Any, Any]:
+    """
+    Wrap functions within a task graph to handle spillable DataFrames.
+
+    Only supports flat task graphs where each DataFrame can be found in the
+    outermost level. Currently, this is true for all cudf-polars task graphs.
+
+    Parameters
+    ----------
+    graph
+        Task graph.
+    ignore_key
+        The key to ignore when wrapping functions, typically the key of the
+        output node.
+    config_options
+        GPUEngine configuration options.
+
+    Returns
+    -------
+    A new task graph with wrapped functions.
+    """
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'wrap_dataframe_in_spillable'"
+    )
+    target_partition_size = config_options.executor.target_partition_size
+
+    ret = {}
+    for key, task in graph.items():
+        assert isinstance(task, tuple)
+        ret[key] = tuple(
+            wrap_func_spillable(
+                a,
+                make_func_output_spillable=key != ignore_key,
+                target_partition_size=target_partition_size,
+            )
+            if callable(a)
+            else a
+            for a in task
+        )
+    return ret
```
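The wrapper pattern above unwraps spillable inputs, runs the task, and re-wraps its output. Below is a minimal CPU-only sketch of that round trip, using a hypothetical `FakeSpillable` stand-in for rapidsmpf's `SpillableWrapper` (the real class can additionally hold a host-memory copy, which the headroom logic above inspects):

```python
from collections.abc import Callable
from typing import Any


class FakeSpillable:
    """Hypothetical stand-in for rapidsmpf's SpillableWrapper."""

    def __init__(self, on_device: Any) -> None:
        self._value = on_device

    def unspill(self) -> Any:
        return self._value


def wrap_task(func: Callable) -> Callable:
    def wrapper(*args: Any) -> Any:
        # Unwrap spillable inputs, run the task, re-wrap the output,
        # mirroring wrap_func_spillable (minus the headroom bookkeeping).
        unwrapped = tuple(
            a.unspill() if isinstance(a, FakeSpillable) else a for a in args
        )
        return FakeSpillable(func(*unwrapped))

    return wrapper


add = wrap_task(lambda x, y: x + y)
assert add(FakeSpillable(1), 2).unspill() == 3
```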
cudf_polars/experimental/utils.py ADDED

```diff
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition utilities."""
+
+from __future__ import annotations
+
+import operator
+import warnings
+from functools import reduce
+from itertools import chain
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.expr import Col
+from cudf_polars.dsl.ir import Union
+from cudf_polars.experimental.base import PartitionInfo
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.dsl.expr import Expr
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.utils.config import ConfigOptions
+
+
+def _concat(*dfs: DataFrame) -> DataFrame:
+    # Concatenate a sequence of DataFrames vertically
+    return Union.do_evaluate(None, *dfs)
+
+
+def _fallback_inform(msg: str, config_options: ConfigOptions) -> None:
+    """Inform the user of single-partition fallback."""
+    # Satisfy type checking
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in '_fallback_inform'"
+    )
+
+    match fallback_mode := config_options.executor.fallback_mode:
+        case "warn":
+            warnings.warn(msg, stacklevel=2)
+        case "raise":
+            raise NotImplementedError(msg)
+        case "silent":
+            pass
+        case _:  # pragma: no cover; Should never get here.
+            raise ValueError(
+                f"{fallback_mode} is not a supported 'fallback_mode' "
+                "option. Please use 'warn', 'raise', or 'silent'."
+            )
+
+
+def _lower_ir_fallback(
+    ir: IR,
+    rec: LowerIRTransformer,
+    *,
+    msg: str | None = None,
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Catch-all single-partition lowering logic.
+    # If any children contain multiple partitions,
+    # those children will be collapsed with `Repartition`.
+    from cudf_polars.experimental.repartition import Repartition
+
+    # Lower children
+    lowered_children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True)
+    partition_info = reduce(operator.or_, _partition_info)
+
+    # Ensure all children are single-partitioned
+    children = []
+    fallback = False
+    for c in lowered_children:
+        child = c
+        if partition_info[c].count > 1:
+            # Fall-back logic
+            fallback = True
+            child = Repartition(child.schema, child)
+            partition_info[child] = PartitionInfo(count=1)
+        children.append(child)
+
+    if fallback and msg:
+        # Warn/raise the user if any children were collapsed
+        # and the "fallback_mode" configuration is not "silent"
+        _fallback_inform(msg, rec.state["config_options"])
+
+    # Reconstruct and return
+    new_node = ir.reconstruct(children)
+    partition_info[new_node] = PartitionInfo(count=1)
+    return new_node, partition_info
+
+
+def _leaf_column_names(expr: Expr) -> tuple[str, ...]:
+    """Find the leaf column names of an expression."""
+    if expr.children:
+        return tuple(
+            chain.from_iterable(_leaf_column_names(child) for child in expr.children)
+        )
+    elif isinstance(expr, Col):
+        return (expr.name,)
+    else:
+        return ()
```
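`_leaf_column_names` is a plain bottom-up traversal that collects the `Col` leaves of an expression tree. A toy illustration of the same recursion, with hypothetical `Node`/`Col` dataclasses standing in for cudf-polars `Expr` objects:

```python
from __future__ import annotations

from dataclasses import dataclass
from itertools import chain


@dataclass
class Node:
    """Hypothetical stand-in for a cudf-polars Expr node."""

    children: tuple[Node, ...] = ()


@dataclass
class Col(Node):
    name: str = ""


def leaf_column_names(expr: Node) -> tuple[str, ...]:
    # Recurse into children; only Col leaves contribute names.
    if expr.children:
        return tuple(
            chain.from_iterable(leaf_column_names(c) for c in expr.children)
        )
    if isinstance(expr, Col):
        return (expr.name,)
    return ()


# An expression over columns "a" and "b" plus a non-column leaf.
tree = Node(children=(Col(name="a"), Col(name="b"), Node()))
assert leaf_column_names(tree) == ("a", "b")
```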
cudf_polars/testing/asserts.py CHANGED

```diff
@@ -5,24 +5,30 @@
 
 from __future__ import annotations
 
+from pathlib import Path
 from typing import TYPE_CHECKING
 
+import polars as pl
 from polars import GPUEngine
 from polars.testing.asserts import assert_frame_equal
 
 from cudf_polars.dsl.translate import Translator
 
 if TYPE_CHECKING:
-    import polars as pl
-
     from cudf_polars.typing import OptimizationArgs
 
-__all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"]
 
+__all__: list[str] = [
+    "assert_gpu_result_equal",
+    "assert_ir_translation_raises",
+    "assert_sink_ir_translation_raises",
+    "assert_sink_result_equal",
+]
 
 # Will be overridden by `conftest.py` with the value from the `--executor`
-# command-line
-
+# and `--scheduler` command-line arguments
+DEFAULT_EXECUTOR = "in-memory"
+DEFAULT_SCHEDULER = "synchronous"
 
 
 def assert_gpu_result_equal(
@@ -89,7 +95,14 @@ def assert_gpu_result_equal(
         If GPU collection failed in some way.
     """
     if engine is None:
-        engine = GPUEngine(raise_on_fail=True)
+        executor = executor or DEFAULT_EXECUTOR
+        engine = GPUEngine(
+            raise_on_fail=True,
+            executor=executor,
+            executor_options=(
+                {"scheduler": DEFAULT_SCHEDULER} if executor == "streaming" else {}
+            ),
+        )
 
     final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs(
         collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs
@@ -236,3 +249,130 @@ def assert_collect_raises(
     else:
         if cudf_except != ():
             raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
+
+
+def _resolve_sink_format(path: Path) -> str:
+    """Return a valid sink format for the assert utilities."""
+    suffix = path.suffix.lower()
+    supported_ext = {
+        ".csv": "csv",
+        ".pq": "parquet",
+        ".parquet": "parquet",
+        ".json": "ndjson",
+        ".ndjson": "ndjson",
+    }
+    if suffix not in supported_ext:
+        raise ValueError(f"Unsupported file format: {suffix}")
+    return supported_ext[suffix]
+
+
+def assert_sink_result_equal(
+    lazydf: pl.LazyFrame,
+    path: str | Path,
+    *,
+    engine: str | GPUEngine | None = None,
+    read_kwargs: dict | None = None,
+    write_kwargs: dict | None = None,
+    executor: str | None = None,
+) -> None:
+    """
+    Assert that writing a LazyFrame via sink produces the same output.
+
+    Parameters
+    ----------
+    lazydf
+        The LazyFrame to sink.
+    path
+        The file path to use. Suffix must be one of:
+        '.csv', '.parquet', '.pq', '.json', '.ndjson'.
+    engine
+        The GPU engine to use for the sink operation.
+    read_kwargs
+        Optional keyword arguments to pass to the corresponding `pl.read_*` function.
+    write_kwargs
+        Optional keyword arguments to pass to the corresponding `sink_*` function.
+    executor
+        The executor configuration to pass to `GPUEngine`. If not specified,
+        uses the module-level `DEFAULT_EXECUTOR` attribute.
+
+    Raises
+    ------
+    AssertionError
+        If the outputs from CPU and GPU sink differ.
+    ValueError
+        If the file extension is not one of the supported formats.
+    """
+    if engine is None:
+        executor = executor or DEFAULT_EXECUTOR
+        engine = GPUEngine(
+            raise_on_fail=True,
+            executor=executor,
+            executor_options=(
+                {"scheduler": DEFAULT_SCHEDULER} if executor == "streaming" else {}
+            ),
+        )
+    path = Path(path)
+    read_kwargs = read_kwargs or {}
+    write_kwargs = write_kwargs or {}
+
+    fmt = _resolve_sink_format(path)
+
+    cpu_path = path.with_name(f"{path.stem}_cpu{path.suffix}")
+    gpu_path = path.with_name(f"{path.stem}_gpu{path.suffix}")
+
+    sink_fn = getattr(lazydf, f"sink_{fmt}")
+    read_fn = getattr(pl, f"read_{fmt}")
+
+    sink_fn(cpu_path, **write_kwargs)
+    sink_fn(gpu_path, engine=engine, **write_kwargs)
+
+    expected = read_fn(cpu_path, **read_kwargs)
+    result = read_fn(gpu_path, **read_kwargs)
+
+    assert_frame_equal(expected, result)
+
+
+def assert_sink_ir_translation_raises(
+    lazydf: pl.LazyFrame,
+    path: str | Path,
+    write_kwargs: dict,
+    *exceptions: type[Exception],
+) -> None:
+    """
+    Assert that translation of a sink query raises an exception.
+
+    Parameters
+    ----------
+    lazydf
+        The LazyFrame to sink.
+    path
+        The file path. Must have one of the supported suffixes.
+    write_kwargs
+        Keyword arguments to pass to the `sink_*` method.
+    *exceptions
+        One or more expected exception types that should be raised during translation.
+
+    Raises
+    ------
+    AssertionError
+        If translation does not raise any of the expected exceptions.
+        If an exception occurs before translation begins.
+    ValueError
+        If the file extension is not one of the supported formats.
+    """
+    path = Path(path)
+    fmt = _resolve_sink_format(path)
+
+    try:
+        lazy_sink = getattr(lazydf, f"sink_{fmt}")(
+            path,
+            engine="gpu",
+            lazy=True,
+            **write_kwargs,
+        )
+    except Exception as e:
+        raise AssertionError(
+            f"Sink function raised an exception before translation: {e}"
+        ) from e
+
+    assert_ir_translation_raises(lazy_sink, *exceptions)
```
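`assert_sink_result_equal` sinks the same LazyFrame once on CPU and once on GPU, reads both files back, and compares the frames. A sketch of how it might appear in a test (assumes a GPU-enabled cudf-polars installation; `tmp_path` is pytest's built-in fixture):

```python
import polars as pl

from cudf_polars.testing.asserts import assert_sink_result_equal


def test_sink_parquet_roundtrip(tmp_path):
    ldf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    # Writes out_cpu.parquet and out_gpu.parquet, reads both back with
    # pl.read_parquet, and asserts frame equality.
    assert_sink_result_equal(ldf, tmp_path / "out.parquet")
```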
cudf_polars/testing/io.py ADDED

```diff
@@ -0,0 +1,72 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""IO testing utilities."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+    import polars as pl
+
+__all__: list[str] = ["make_partitioned_source"]
+
+
+def make_partitioned_source(
+    df: pl.DataFrame,
+    path: str | Path,
+    fmt: Literal["csv", "ndjson", "parquet", "chunked_parquet"],
+    *,
+    n_files: int = 1,
+    row_group_size: int | None = None,
+    write_kwargs: dict | None = None,
+) -> None:
+    """
+    Write the Polars DataFrame to one or more files of the desired format.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        The input DataFrame to write.
+    path : str | pathlib.Path
+        The base path to write the file(s) to.
+    fmt : Literal["csv", "ndjson", "parquet", "chunked_parquet"]
+        The format to write in.
+    n_files : int, default 1
+        If greater than 1, splits the data into multiple files.
+    row_group_size : int, optional
+        Only used for Parquet. Specifies the row group size per file.
+    write_kwargs : dict, optional
+        Additional keyword arguments to pass to the write_* functions.
+    """
+    path = Path(path)
+    write_kwargs = write_kwargs or {}
+
+    def write(part: pl.DataFrame, file_path: Path) -> None:
+        match fmt:
+            case "csv":
+                part.write_csv(file_path, **write_kwargs)
+            case "ndjson":
+                part.write_ndjson(file_path, **write_kwargs)
+            case "parquet" | "chunked_parquet":
+                part.write_parquet(
+                    file_path,
+                    row_group_size=row_group_size or (len(part) // 2),
+                    **write_kwargs,
+                )
+            case _:
+                raise ValueError(f"Unsupported format: {fmt}")
+
+    if n_files == 1:
+        if path.is_dir():
+            path = path / f"part.0.{fmt}"
+        write(df, path)
+    else:
+        stride = len(df) // n_files
+        for i, part in enumerate(df.iter_slices(stride)):
+            file_path = path / f"part.{i}.{fmt}"
+            write(part, file_path)
```
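For example, `make_partitioned_source` can split a 12-row frame into three Parquet files that a multi-partition scan can pick up (a sketch assuming a local polars installation; no GPU required for the write itself):

```python
import tempfile
from pathlib import Path

import polars as pl

from cudf_polars.testing.io import make_partitioned_source

df = pl.DataFrame({"x": range(12), "y": ["a", "b", "c"] * 4})
with tempfile.TemporaryDirectory() as tmp:
    # stride = 12 // 3, so this writes part.0.parquet ... part.2.parquet.
    make_partitioned_source(df, tmp, "parquet", n_files=3)
    total = pl.scan_parquet(Path(tmp) / "part.*.parquet").collect().height
    assert total == 12
```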