cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/spilling.py
@@ -0,0 +1,151 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Spilling in multi-partition Dask execution using RAPIDSMPF."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from dask.sizeof import sizeof
+from distributed import get_worker
+from rapidsmpf.buffer.buffer import MemoryType
+from rapidsmpf.integrations.dask.core import get_worker_context
+from rapidsmpf.integrations.dask.spilling import SpillableWrapper
+
+from cudf_polars.containers import DataFrame
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, MutableMapping
+    from typing import Any
+
+    from cudf_polars.utils.config import ConfigOptions
+
+
+def wrap_arg(obj: Any) -> Any:
+    """
+    Make `obj` spillable if it is a DataFrame.
+
+    Parameters
+    ----------
+    obj
+        The object to be wrapped (if it is a DataFrame).
+
+    Returns
+    -------
+    A SpillableWrapper if obj is a DataFrame, otherwise the original object.
+    """
+    if isinstance(obj, DataFrame):
+        return SpillableWrapper(on_device=obj)
+    return obj
+
+
+def unwrap_arg(obj: Any) -> Any:
+    """
+    Unwraps a SpillableWrapper to retrieve the original object.
+
+    Parameters
+    ----------
+    obj
+        The object to be unwrapped.
+
+    Returns
+    -------
+    The unwrapped object if obj is a SpillableWrapper, otherwise the original object.
+    """
+    if isinstance(obj, SpillableWrapper):
+        return obj.unspill()
+    return obj
+
+
+def wrap_func_spillable(
+    func: Callable,
+    *,
+    make_func_output_spillable: bool,
+    target_partition_size: int,
+) -> Callable:
+    """
+    Wraps a function to handle spillable DataFrames.
+
+    Parameters
+    ----------
+    func
+        The function to be wrapped.
+    make_func_output_spillable
+        Whether to wrap the function's output in a SpillableWrapper.
+    target_partition_size
+        Target byte size for IO tasks.
+
+    Returns
+    -------
+    A wrapped function that processes spillable DataFrames.
+    """
+
+    def wrapper(*args: Any) -> Any:
+        # Make headroom before executing the task
+        headroom = 0
+        probable_io_task = True
+        for arg in args:
+            if isinstance(arg, SpillableWrapper):
+                if arg.mem_type() == MemoryType.HOST:
+                    headroom += sizeof(arg._on_host)
+                probable_io_task = False
+        if probable_io_task:
+            # Likely an IO task - assume we need target_partition_size
+            headroom = target_partition_size
+        if headroom > 128_000_000:  # Don't waste time on smaller data
+            ctx = get_worker_context(get_worker())
+            with ctx.lock:
+                ctx.br.spill_manager.spill_to_make_headroom(headroom=headroom)
+
+        ret: Any = func(*(unwrap_arg(arg) for arg in args))
+        if make_func_output_spillable:
+            ret = wrap_arg(ret)
+        return ret
+
+    return wrapper
+
+
+def wrap_dataframe_in_spillable(
+    graph: MutableMapping[Any, Any],
+    ignore_key: str | tuple[str, int],
+    config_options: ConfigOptions,
+) -> MutableMapping[Any, Any]:
+    """
+    Wraps functions within a task graph to handle spillable DataFrames.
+
+    Only supports flat task graphs where each DataFrame can be found in the
+    outermost level. Currently, this is true for all cudf-polars task graphs.
+
+    Parameters
+    ----------
+    graph
+        Task graph.
+    ignore_key
+        The key to ignore when wrapping functions, typically the key of the
+        output node.
+    config_options
+        GPUEngine configuration options.
+
+    Returns
+    -------
+    A new task graph with wrapped functions.
+    """
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'wrap_dataframe_in_spillable'"
+    )
+    target_partition_size = config_options.executor.target_partition_size
+
+    ret = {}
+    for key, task in graph.items():
+        assert isinstance(task, tuple)
+        ret[key] = tuple(
+            wrap_func_spillable(
+                a,
+                make_func_output_spillable=key != ignore_key,
+                target_partition_size=target_partition_size,
+            )
+            if callable(a)
+            else a
+            for a in task
+        )
+    return ret
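
The wrapper above follows an unwrap-inputs, run, re-wrap-output pattern around each task in the graph. A minimal, self-contained sketch of that pattern, using a hypothetical ToySpillable class as a stand-in for rapidsmpf's SpillableWrapper so the control flow runs without a GPU or Dask worker:

from typing import Any, Callable


class ToySpillable:
    # Hypothetical stand-in for rapidsmpf's SpillableWrapper
    def __init__(self, on_device: Any) -> None:
        self._obj = on_device

    def unspill(self) -> Any:
        return self._obj


def wrap_func(func: Callable, *, make_output_spillable: bool) -> Callable:
    def wrapper(*args: Any) -> Any:
        # Unwrap spillable inputs, run the task, then re-wrap the output
        ret = func(*(a.unspill() if isinstance(a, ToySpillable) else a for a in args))
        return ToySpillable(ret) if make_output_spillable else ret

    return wrapper


# A flat task graph, the shape wrap_dataframe_in_spillable expects
graph = {("add", 0): (wrap_func(lambda a, b: a + b, make_output_spillable=True), 1, 2)}
fn, *args = graph[("add", 0)]
print(fn(*args).unspill())  # 3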
cudf_polars/experimental/utils.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition utilities."""
+
+from __future__ import annotations
+
+import operator
+import warnings
+from functools import reduce
+from itertools import chain
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.expr import Col
+from cudf_polars.dsl.ir import Union
+from cudf_polars.experimental.base import PartitionInfo
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.dsl.expr import Expr
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.utils.config import ConfigOptions
+
+
+def _concat(*dfs: DataFrame) -> DataFrame:
+    # Concatenate a sequence of DataFrames vertically
+    return Union.do_evaluate(None, *dfs)
+
+
+def _fallback_inform(msg: str, config_options: ConfigOptions) -> None:
+    """Inform the user of single-partition fallback."""
+    # Satisfy type checking
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in '_fallback_inform'"
+    )
+
+    match fallback_mode := config_options.executor.fallback_mode:
+        case "warn":
+            warnings.warn(msg, stacklevel=2)
+        case "raise":
+            raise NotImplementedError(msg)
+        case "silent":
+            pass
+        case _:  # pragma: no cover; Should never get here.
+            raise ValueError(
+                f"{fallback_mode} is not a supported 'fallback_mode' "
+                "option. Please use 'warn', 'raise', or 'silent'."
+            )
+
+
+def _lower_ir_fallback(
+    ir: IR,
+    rec: LowerIRTransformer,
+    *,
+    msg: str | None = None,
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Catch-all single-partition lowering logic.
+    # If any children contain multiple partitions,
+    # those children will be collapsed with `Repartition`.
+    from cudf_polars.experimental.repartition import Repartition
+
+    # Lower children
+    lowered_children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True)
+    partition_info = reduce(operator.or_, _partition_info)
+
+    # Ensure all children are single-partitioned
+    children = []
+    fallback = False
+    for c in lowered_children:
+        child = c
+        if partition_info[c].count > 1:
+            # Fall-back logic
+            fallback = True
+            child = Repartition(child.schema, child)
+            partition_info[child] = PartitionInfo(count=1)
+        children.append(child)
+
+    if fallback and msg:
+        # Warn/raise the user if any children were collapsed
+        # and the "fallback_mode" configuration is not "silent"
+        _fallback_inform(msg, rec.state["config_options"])
+
+    # Reconstruct and return
+    new_node = ir.reconstruct(children)
+    partition_info[new_node] = PartitionInfo(count=1)
+    return new_node, partition_info
+
+
+def _leaf_column_names(expr: Expr) -> tuple[str, ...]:
+    """Find the leaf column names of an expression."""
+    if expr.children:
+        return tuple(
+            chain.from_iterable(_leaf_column_names(child) for child in expr.children)
+        )
+    elif isinstance(expr, Col):
+        return (expr.name,)
+    else:
+        return ()
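
The "warn", "raise", and "silent" modes handled by _fallback_inform are user-selectable. A hedged sketch of choosing one, assuming fallback_mode is accepted as a streaming executor option via GPUEngine's executor_options (this diff only confirms the scheduler key there, so the option name is an assumption):

import polars as pl

engine = pl.GPUEngine(
    executor="streaming",
    # Assumed option name; "warn" and "silent" are the other handled values
    executor_options={"fallback_mode": "raise"},
)
result = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum()).collect(engine=engine)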
cudf_polars/testing/asserts.py
@@ -5,24 +5,30 @@
 
 from __future__ import annotations
 
+from pathlib import Path
 from typing import TYPE_CHECKING
 
+import polars as pl
 from polars import GPUEngine
 from polars.testing.asserts import assert_frame_equal
 
 from cudf_polars.dsl.translate import Translator
 
 if TYPE_CHECKING:
-    import polars as pl
-
     from cudf_polars.typing import OptimizationArgs
 
-__all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"]
 
+__all__: list[str] = [
+    "assert_gpu_result_equal",
+    "assert_ir_translation_raises",
+    "assert_sink_ir_translation_raises",
+    "assert_sink_result_equal",
+]
 
 # Will be overridden by `conftest.py` with the value from the `--executor`
-# command-line argument
-Executor = None
+# and `--scheduler` command-line arguments
+DEFAULT_EXECUTOR = "in-memory"
+DEFAULT_SCHEDULER = "synchronous"
 
 
 def assert_gpu_result_equal(
@@ -89,7 +95,14 @@ def assert_gpu_result_equal(
         If GPU collection failed in some way.
     """
     if engine is None:
-        engine = GPUEngine(raise_on_fail=True, executor=executor or Executor)
+        executor = executor or DEFAULT_EXECUTOR
+        engine = GPUEngine(
+            raise_on_fail=True,
+            executor=executor,
+            executor_options=(
+                {"scheduler": DEFAULT_SCHEDULER} if executor == "streaming" else {}
+            ),
+        )
 
     final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs(
         collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs
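
With the defaults above, assert_gpu_result_equal builds its own engine when none is passed. A short usage sketch, assuming cudf-polars is installed with a working GPU:

import polars as pl
from cudf_polars.testing.asserts import assert_gpu_result_equal

q = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).select(pl.col("a") + pl.col("b"))

# Defaults to DEFAULT_EXECUTOR; passing executor="streaming" also injects
# {"scheduler": DEFAULT_SCHEDULER} into executor_options
assert_gpu_result_equal(q)
assert_gpu_result_equal(q, executor="streaming")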
@@ -236,3 +249,130 @@ def assert_collect_raises(
     else:
         if cudf_except != ():
             raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
+
+
+def _resolve_sink_format(path: Path) -> str:
+    """Return a valid sink format for the assert utilities."""
+    suffix = path.suffix.lower()
+    supported_ext = {
+        ".csv": "csv",
+        ".pq": "parquet",
+        ".parquet": "parquet",
+        ".json": "ndjson",
+        ".ndjson": "ndjson",
+    }
+    if suffix not in supported_ext:
+        raise ValueError(f"Unsupported file format: {suffix}")
+    return supported_ext[suffix]
+
+
+def assert_sink_result_equal(
+    lazydf: pl.LazyFrame,
+    path: str | Path,
+    *,
+    engine: str | GPUEngine | None = None,
+    read_kwargs: dict | None = None,
+    write_kwargs: dict | None = None,
+    executor: str | None = None,
+) -> None:
+    """
+    Assert that writing a LazyFrame via sink produces the same output.
+
+    Parameters
+    ----------
+    lazydf
+        The LazyFrame to sink.
+    path
+        The file path to use. Suffix must be one of:
+        '.csv', '.parquet', '.pq', '.json', '.ndjson'.
+    engine
+        The GPU engine to use for the sink operation.
+    read_kwargs
+        Optional keyword arguments to pass to the corresponding `pl.read_*` function.
+    write_kwargs
+        Optional keyword arguments to pass to the corresponding `sink_*` function.
+    executor
+        The executor configuration to pass to `GPUEngine`. If not specified,
+        uses the module-level `DEFAULT_EXECUTOR` attribute.
+
+    Raises
+    ------
+    AssertionError
+        If the outputs from CPU and GPU sink differ.
+    ValueError
+        If the file extension is not one of the supported formats.
+    """
+    if engine is None:
+        executor = executor or DEFAULT_EXECUTOR
+        engine = GPUEngine(
+            raise_on_fail=True,
+            executor=executor,
+            executor_options=(
+                {"scheduler": DEFAULT_SCHEDULER} if executor == "streaming" else {}
+            ),
+        )
+    path = Path(path)
+    read_kwargs = read_kwargs or {}
+    write_kwargs = write_kwargs or {}
+
+    fmt = _resolve_sink_format(path)
+
+    cpu_path = path.with_name(f"{path.stem}_cpu{path.suffix}")
+    gpu_path = path.with_name(f"{path.stem}_gpu{path.suffix}")
+
+    sink_fn = getattr(lazydf, f"sink_{fmt}")
+    read_fn = getattr(pl, f"read_{fmt}")
+
+    sink_fn(cpu_path, **write_kwargs)
+    sink_fn(gpu_path, engine=engine, **write_kwargs)
+
+    expected = read_fn(cpu_path, **read_kwargs)
+    result = read_fn(gpu_path, **read_kwargs)
+
+    assert_frame_equal(expected, result)
+
+
+def assert_sink_ir_translation_raises(
+    lazydf: pl.LazyFrame,
+    path: str | Path,
+    write_kwargs: dict,
+    *exceptions: type[Exception],
+) -> None:
+    """
+    Assert that translation of a sink query raises an exception.
+
+    Parameters
+    ----------
+    lazydf
+        The LazyFrame to sink.
+    path
+        The file path. Must have one of the supported suffixes.
+    write_kwargs
+        Keyword arguments to pass to the `sink_*` method.
+    *exceptions
+        One or more expected exception types that should be raised during translation.
+
+    Raises
+    ------
+    AssertionError
+        If translation does not raise any of the expected exceptions,
+        or if an exception occurs before translation begins.
+    ValueError
+        If the file extension is not one of the supported formats.
+    """
+    path = Path(path)
+    fmt = _resolve_sink_format(path)
+
+    try:
+        lazy_sink = getattr(lazydf, f"sink_{fmt}")(
+            path,
+            engine="gpu",
+            lazy=True,
+            **write_kwargs,
+        )
+    except Exception as e:
+        raise AssertionError(
+            f"Sink function raised an exception before translation: {e}"
+        ) from e
+
+    assert_ir_translation_raises(lazy_sink, *exceptions)
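
A usage sketch for the new sink assertion, following the signature above (the output path is illustrative):

import tempfile
from pathlib import Path

import polars as pl
from cudf_polars.testing.asserts import assert_sink_result_equal

lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
with tempfile.TemporaryDirectory() as tmp:
    # Sinks <tmp>/out_cpu.parquet (CPU) and <tmp>/out_gpu.parquet (GPU),
    # reads both back with pl.read_parquet, and compares the frames
    assert_sink_result_equal(lf, Path(tmp) / "out.parquet")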
cudf_polars/testing/io.py
@@ -0,0 +1,72 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""IO testing utilities."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+    import polars as pl
+
+__all__: list[str] = ["make_partitioned_source"]
+
+
+def make_partitioned_source(
+    df: pl.DataFrame,
+    path: str | Path,
+    fmt: Literal["csv", "ndjson", "parquet", "chunked_parquet"],
+    *,
+    n_files: int = 1,
+    row_group_size: int | None = None,
+    write_kwargs: dict | None = None,
+) -> None:
+    """
+    Write the Polars DataFrame to one or more files of the desired format.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        The input DataFrame to write.
+    path : str | pathlib.Path
+        The base path to write the file(s) to.
+    fmt : Literal["csv", "ndjson", "parquet", "chunked_parquet"]
+        The format to write in.
+    n_files : int, default 1
+        If greater than 1, splits the data into multiple files.
+    row_group_size : int, optional
+        Only used for Parquet. Specifies the row group size per file.
+    write_kwargs : dict, optional
+        Additional keyword arguments to pass to the write_* functions.
+    """
+    path = Path(path)
+    write_kwargs = write_kwargs or {}
+
+    def write(part: pl.DataFrame, file_path: Path) -> None:
+        match fmt:
+            case "csv":
+                part.write_csv(file_path, **write_kwargs)
+            case "ndjson":
+                part.write_ndjson(file_path, **write_kwargs)
+            case "parquet" | "chunked_parquet":
+                part.write_parquet(
+                    file_path,
+                    row_group_size=row_group_size or (len(part) // 2),
+                    **write_kwargs,
+                )
+            case _:
+                raise ValueError(f"Unsupported format: {fmt}")
+
+    if n_files == 1:
+        if path.is_dir():
+            path = path / f"part.0.{fmt}"
+        write(df, path)
+    else:
+        stride = len(df) // n_files
+        for i, part in enumerate(df.iter_slices(stride)):
+            file_path = path / f"part.{i}.{fmt}"
+            write(part, file_path)
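
A usage sketch for make_partitioned_source, following the signature above:

import tempfile
from pathlib import Path

import polars as pl
from cudf_polars.testing.io import make_partitioned_source

df = pl.DataFrame({"a": range(10), "b": range(10)})
with tempfile.TemporaryDirectory() as tmp:
    # Splits df into two slices and writes part.0.parquet and part.1.parquet
    make_partitioned_source(df, Path(tmp), "parquet", n_files=2)
    print(sorted(p.name for p in Path(tmp).iterdir()))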