cudf_polars_cu13-25.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/testing/io.py
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""IO testing utilities."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+__all__: list[str] = ["make_partitioned_source"]
+
+
+def make_partitioned_source(
+    df: pl.DataFrame,
+    path: str | Path,
+    fmt: Literal["csv", "ndjson", "parquet", "chunked_parquet"],
+    *,
+    n_files: int = 1,
+    row_group_size: int | None = None,
+    write_kwargs: dict | None = None,
+) -> None:
+    """
+    Write the Polars DataFrame to one or more files of the desired format.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        The input DataFrame to write.
+    path : str | pathlib.Path
+        The base path to write the file(s) to.
+    fmt : Literal["csv", "ndjson", "parquet", "chunked_parquet"]
+        The format to write in.
+    n_files : int, default 1
+        If greater than 1, splits the data into multiple files.
+    row_group_size : int, optional
+        Only used for Parquet. Specifies the row group size per file.
+    write_kwargs : dict, optional
+        Additional keyword arguments to pass to the write_* functions.
+    """
+    path = Path(path)
+    write_kwargs = write_kwargs or {}
+
+    def write(part: pl.DataFrame, file_path: Path) -> None:
+        match fmt:
+            case "csv":
+                part.write_csv(file_path, **write_kwargs)
+            case "ndjson":
+                part.write_ndjson(file_path, **write_kwargs)
+            case "parquet" | "chunked_parquet":
+                part.write_parquet(
+                    file_path,
+                    row_group_size=row_group_size or (len(part) // 2),
+                    **write_kwargs,
+                )
+            case _:
+                raise ValueError(f"Unsupported format: {fmt}")
+
+    if n_files == 1:
+        if path.is_dir():
+            path = path / f"part.0.{fmt}"
+        write(df, path)
+    else:
+        stride = len(df) // n_files
+        for i, part in enumerate(df.iter_slices(stride)):
+            file_path = path / f"part.{i}.{fmt}"
+            write(part, file_path)
+
+
+def make_lazy_frame(
+    df: pl.DataFrame,
+    fmt: Literal["csv", "parquet", "frame"],
+    *,
+    path: str | Path | None = None,
+    n_files: int = 1,
+    n_rows: int | None = None,
+) -> pl.LazyFrame:
+    """
+    Return a pl.LazyFrame built from a pl.DataFrame.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        The input DataFrame to convert to a LazyFrame.
+    path : str | pathlib.Path
+        The base path to write the file(s) to.
+        This option is ignored if fmt is "frame".
+    fmt : Literal["parquet", "csv", "frame"]
+        The format to use for IO.
+    n_files : int, default 1
+        If greater than 1, splits the data into multiple files.
+        This option is ignored if fmt is "frame".
+    n_rows : int, optional
+        Slice to apply to the final LazyFrame before returning.
+    """
+    from cudf_polars.experimental.io import _clear_source_info_cache
+
+    _clear_source_info_cache()
+
+    if fmt == "frame":
+        if n_rows is not None:
+            return df.slice(0, n_rows).lazy()
+        return df.lazy()
+    else:
+        assert path is not None, f"path is required for fmt={fmt}."
+        row_group_size: int | None = None
+        if fmt == "parquet":
+            read = pl.scan_parquet
+            row_group_size = 10
+        elif fmt == "csv":
+            read = pl.scan_csv
+        else:  # pragma: no cover
+            raise ValueError(f"Unsupported format: {fmt}")
+        make_partitioned_source(
+            df, path, fmt=fmt, n_files=n_files, row_group_size=row_group_size
+        )
+        return read(path, n_rows=n_rows) if n_rows is not None else read(path)
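For orientation, the sketch below exercises make_partitioned_source from the file above. It is not part of the package; the DataFrame contents, temporary directory, and glob pattern are illustrative assumptions.

    # Hypothetical usage of cudf_polars.testing.io.make_partitioned_source.
    import tempfile
    from pathlib import Path

    import polars as pl

    from cudf_polars.testing.io import make_partitioned_source

    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})

    with tempfile.TemporaryDirectory() as tmp:
        # n_files=2 splits the frame into part.0.parquet and part.1.parquet.
        make_partitioned_source(df, Path(tmp), "parquet", n_files=2)
        # Scan the parts back lazily; a glob picks up both files.
        lf = pl.scan_parquet(f"{tmp}/part.*.parquet")
        assert lf.collect().height == 4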
cudf_polars/testing/plugin.py
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Plugin for running polars test suite setting GPU engine as default."""
+
+from __future__ import annotations
+
+from functools import partialmethod
+from typing import TYPE_CHECKING
+
+import pytest
+
+import polars
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Add plugin-specific options."""
+    group = parser.getgroup(
+        "cudf-polars", "Plugin to set GPU as default engine for polars tests"
+    )
+    group.addoption(
+        "--cudf-polars-no-fallback",
+        action="store_true",
+        help="Turn off fallback to CPU when running tests (default use fallback)",
+    )
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Enable use of this module as a pytest plugin to enable GPU collection."""
+    no_fallback = config.getoption("--cudf-polars-no-fallback")
+    if no_fallback:
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(raise_on_fail=no_fallback)
+        # https://github.com/python/mypy/issues/2427
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)  # type: ignore[method-assign,assignment]
+    else:
+        polars.Config.set_engine_affinity("gpu")
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*GPU engine does not support streaming or background collection",
+    )
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*Query execution with GPU not possible",
+    )
+
+
+EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
+    "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
+    "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_schema_evolution_nested_struct_field_19915": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949",
+    "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp_nested": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
+    "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394",
    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_field_overwrites_metadata": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_flat": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype0]": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype1]": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_struct": "cannot serialize in-memory sink target.",
+    "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed",
+    "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed",
+    "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
+    "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU",
+    "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match",
+    "tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "polars returns decimal column with precision=None",
+    "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
+    "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
+    "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
+    "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[False]": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[True]": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
+    "tests/unit/operations/test_join.py::test_join_filter_pushdown_iejoin": "Row order differs due to multiple matches per left row index; join results are correct but unsorted",
+    "tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong",
+    "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv",
+    "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception",
+    "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
+    "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630",
+    "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
+    "tests/unit/test_predicates.py::test_predicate_pushdown_split_pushable": "Casting that raises not supported on GPU",
+    "tests/unit/io/test_scan_row_deletion.py::test_scan_row_deletion_skips_file_with_all_rows_deleted": "The test intentionally corrupts the parquet file, so we cannot read the row count from the header.",
+    "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match",
+    "tests/unit/functions/range/test_linear_space.py::test_linear_space_date": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/sql/test_temporal.py::test_implicit_temporal_strings[dt IN ('1960-01-07','2077-01-01','2222-02-22')-expected15]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/sql/test_operators.py::test_in_not_in[dt NOT IN ('1950-12-24', '1997-07-05')]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/sql/test_operators.py::test_in_not_in[dt IN ('2020-10-10', '2077-03-18')]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/datatypes/test_struct.py::test_struct_agg_all": "Needs nested list[struct] support",
+    "tests/unit/constructors/test_structs.py::test_constructor_non_strict_schema_17956": "Needs nested list[struct] support",
+    "tests/unit/io/test_delta.py::test_read_delta_arrow_map_type": "Needs nested list[struct] support",
+    "tests/unit/datatypes/test_struct.py::test_struct_null_cast": "pylibcudf.Scalar does not support struct scalars",
+    "tests/unit/datatypes/test_struct.py::test_struct_outer_nullability_zip_18119": "pylibcudf.Scalar does not support struct scalars",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-columns]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-none]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899",
+}
+
+
+TESTS_TO_SKIP: Mapping[str, str] = {
+    "tests/unit/operations/test_profile.py::test_profile_with_cse": "Shape assertion won't match",
+    # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks
+    # for obsolete timezone names. However, the chrono_tz package that
+    # polars uses doesn't read /usr/share/zoneinfo, instead packaging
+    # the current zoneinfo database from IANA. Consequently, when this
+    # hypothesis-generated test runs and generates timezones from the
+    # available zoneinfo-reported timezones, we can get an error from
+    # polars that the requested timezone is unknown.
+    # Since this is random, just skip it, rather than xfailing.
+    "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+    # Tests performance difference of CPU engine
+    "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine",
+    "tests/unit/operations/namespaces/list/test_list.py::test_list_struct_field_perf": "Tests CPU Engine perf",
+    "tests/benchmark/test_with_columns.py::test_with_columns_quadratic_19503": "Tests performance bug in CPU engine",
+    # The test may segfault with the legacy streaming engine. We should
+    # remove this skip when all polars tests use the new streaming engine.
+    "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
+    # Fails in CI, but passes locally
+    "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread",
+    # Remove when polars supports Pydantic V3
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_pydantic_2x": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects_nested[_TestFooPD-_TestBarPD-_TestBazPD]": "pydantic deprecation warning",
+    "tests/unit/series/test_series.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/series/test_describe.py::test_series_describe_float": "https://github.com/rapidsai/cudf/issues/19324",
+    "tests/unit/series/test_describe.py::test_series_describe_int": "https://github.com/rapidsai/cudf/issues/19324",
+    "tests/unit/streaming/test_streaming.py::test_streaming_apply": "https://github.com/pola-rs/polars/issues/22558",
+    # New iceberg release causes this test to fail. We can remove in the next polars version bump: https://github.com/rapidsai/cudf/pull/19912
+    "tests/unit/io/test_iceberg.py::test_fill_missing_fields_with_identity_partition_values[False]": "https://github.com/pola-rs/polars/pull/24456",
+}
+
+
+def pytest_collection_modifyitems(
+    session: pytest.Session, config: pytest.Config, items: list[pytest.Item]
+) -> None:
+    """Mark known failing tests."""
+    if config.getoption("--cudf-polars-no-fallback"):
+        # Don't xfail tests if running without fallback
+        return
+    for item in items:
+        if (reason := TESTS_TO_SKIP.get(item.nodeid, None)) is not None:
+            item.add_marker(pytest.mark.skip(reason=reason))
+        elif (entry := EXPECTED_FAILURES.get(item.nodeid, None)) is not None:
+            if isinstance(entry, tuple):
+                # the second entry in the tuple is the condition to xfail on
+                reason, condition = entry
+                item.add_marker(
+                    pytest.mark.xfail(
+                        condition=condition,
+                        reason=reason,
+                    ),
+                )
+            else:
+                item.add_marker(pytest.mark.xfail(reason=entry))
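This module is a pytest plugin, typically loaded when running the upstream polars test suite (for example via pytest's -p option, e.g. -p cudf_polars.testing.plugin). Note that EXPECTED_FAILURES accepts either a bare reason string or a (reason, condition) tuple; pytest_collection_modifyitems unpacks the tuple and applies the xfail marker only when the condition holds. The sketch below shows one entry of each kind; the second node id and its condition are invented for illustration and are not part of the shipped mapping.

    # Hypothetical entries only; structure mirrors EXPECTED_FAILURES above.
    import sys

    example_failures: dict[str, str | tuple[str, bool]] = {
        # Unconditional: always marked xfail with this reason.
        "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
        # Conditional: xfail only when the boolean is true (here, Python >= 3.13).
        "tests/unit/example/test_example.py::test_case": (
            "only expected to fail on newer Python",
            sys.version_info >= (3, 13),
        ),
    }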
cudf_polars/typing/__init__.py
@@ -0,0 +1,219 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Typing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+import sys
+from collections.abc import Hashable, MutableMapping
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    NewType,
+    Protocol,
+    TypeVar,
+    Union,
+)
+
+import polars as pl
+import polars.datatypes
+from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import TypeAlias
+
+    import pylibcudf as plc
+
+    from cudf_polars.containers import DataFrame, DataType
+    from cudf_polars.dsl import nodebase
+
+
+if sys.version_info >= (3, 11):
+    # Inheriting from TypedDict + Generic added in python 3.11
+    from typing import TypedDict  # pragma: no cover
+else:
+    from typing_extensions import TypedDict  # pragma: no cover
+
+
+__all__: list[str] = [
+    "ClosedInterval",
+    "ColumnHeader",
+    "ColumnOptions",
+    "DataFrameHeader",
+    "GenericTransformer",
+    "NodeTraverser",
+    "OptimizationArgs",
+    "PolarsExpr",
+    "PolarsIR",
+    "Schema",
+    "Slice",
+]
+
+PolarsIR: TypeAlias = Union[
+    pl_ir.PythonScan,
+    pl_ir.Scan,
+    pl_ir.Cache,
+    pl_ir.DataFrameScan,
+    pl_ir.Select,
+    pl_ir.GroupBy,
+    pl_ir.Join,
+    pl_ir.HStack,
+    pl_ir.Distinct,
+    pl_ir.Sort,
+    pl_ir.Slice,
+    pl_ir.Filter,
+    pl_ir.SimpleProjection,
+    pl_ir.MapFunction,
+    pl_ir.Union,
+    pl_ir.HConcat,
+    pl_ir.ExtContext,
+]
+
+PolarsExpr: TypeAlias = Union[
+    pl_expr.Function,
+    pl_expr.Window,
+    pl_expr.Literal,
+    pl_expr.Sort,
+    pl_expr.SortBy,
+    pl_expr.Gather,
+    pl_expr.Filter,
+    pl_expr.Cast,
+    pl_expr.Column,
+    pl_expr.Agg,
+    pl_expr.BinaryExpr,
+    pl_expr.Len,
+    pl_expr.PyExprIR,
+]
+
+PolarsSchema: TypeAlias = dict[str, pl.DataType]
+Schema: TypeAlias = dict[str, "DataType"]
+
+PolarsDataType: TypeAlias = polars.datatypes.DataTypeClass | polars.datatypes.DataType
+
+Slice: TypeAlias = tuple[int, int | None]
+
+CSECache: TypeAlias = MutableMapping[int, tuple["DataFrame", int]]
+
+ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"]
+
+Duration = NewType("Duration", tuple[int, int, int, int, bool, bool])
+
+
+class NodeTraverser(Protocol):
+    """Abstract protocol for polars NodeTraverser."""
+
+    def get_node(self) -> int:
+        """Return current plan node id."""
+        ...
+
+    def set_node(self, n: int) -> None:
+        """Set the current plan node to n."""
+        ...
+
+    def view_current_node(self) -> PolarsIR:
+        """Convert current plan node to python rep."""
+        ...
+
+    def get_schema(self) -> PolarsSchema:
+        """Get the schema of the current plan node."""
+        ...
+
+    def get_dtype(self, n: int) -> pl.DataType:
+        """Get the datatype of the given expression id."""
+        ...
+
+    def view_expression(self, n: int) -> PolarsExpr:
+        """Convert the given expression to python rep."""
+        ...
+
+    def version(self) -> tuple[int, int]:
+        """The IR version as `(major, minor)`."""
+        ...
+
+    def set_udf(
+        self,
+        callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame],
+    ) -> None:
+        """Set the callback replacing the current node in the plan."""
+        ...
+
+
+OptimizationArgs: TypeAlias = Literal[
+    "type_coercion",
+    "predicate_pushdown",
+    "projection_pushdown",
+    "simplify_expression",
+    "slice_pushdown",
+    "comm_subplan_elim",
+    "comm_subexpr_elim",
+    "cluster_with_columns",
+    "no_optimization",
+]
+
+
+U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True)
+V_co = TypeVar("V_co", covariant=True)
+StateT_co = TypeVar("StateT_co", covariant=True)
+NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]")
+
+
+class GenericTransformer(Protocol[U_contra, V_co, StateT_co]):
+    """Abstract protocol for recursive visitors."""
+
+    def __call__(self, __value: U_contra) -> V_co:
+        """Apply the visitor to the node."""
+        ...
+
+    @property
+    def state(self) -> StateT_co:
+        """Transform-specific immutable state."""
+        ...
+
+
+class ColumnOptions(TypedDict):
+    """
+    Column constructor options.
+
+    Notes
+    -----
+    Used to serialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+    dtype: str
+
+
+class DeserializedColumnOptions(TypedDict):
+    """
+    Deserialized Column constructor options.
+
+    Notes
+    -----
+    Used to deserialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+    dtype: DataType
+
+
+class ColumnHeader(TypedDict):
+    """Column serialization header."""
+
+    column_kwargs: ColumnOptions
+    frame_count: int
+
+
+class DataFrameHeader(TypedDict):
+    """DataFrame serialization header."""
+
+    columns_kwargs: list[ColumnOptions]
+    frame_count: int
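GenericTransformer above is a structural protocol: any callable object that maps an input node to a result and exposes a read-only state property conforms to it. The sketch below is not from the package; the class name and its string-based "nodes" are invented purely to illustrate the shape of a conforming visitor.

    # Hypothetical object satisfying GenericTransformer[str, int, dict].
    from typing import Any


    class LengthVisitor:
        """Toy visitor: maps a string "node" to its length, carrying fixed state."""

        def __init__(self, state: dict[str, Any]) -> None:
            self._state = state

        def __call__(self, value: str) -> int:
            # Visit a node; real transformers in cudf_polars walk IR/expression nodes.
            return len(value)

        @property
        def state(self) -> dict[str, Any]:
            return self._state


    visitor = LengthVisitor({"depth": 0})
    assert visitor("Select") == 6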
cudf_polars/utils/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities."""
+
+from __future__ import annotations
+
+__all__: list[str] = []