cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
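
Beyond routine updates, the bulk of the new code lands in `cudf_polars/experimental/` (multi-partition execution: shuffling, repartitioning, a task scheduler, spilling, and a PDS-H benchmark runner) and in `cudf_polars/utils/config.py`, which centralizes engine configuration. As orientation, a minimal sketch of selecting the two executors from polars; `raise_on_fail` appears in this diff, while the `executor`/`executor_options` names are assumptions based on the 25.06 series and should be checked against the cudf-polars documentation:

```python
import polars as pl

q = (
    pl.scan_parquet("data/*.parquet")
    .group_by("key")
    .agg(pl.col("value").sum())
)

# Default single-GPU, in-memory executor; raise instead of falling
# back to the CPU engine when a query is unsupported.
result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))

# Experimental multi-partition executor backed by the modules added
# in this release (option names assumed, not a stable API).
engine = pl.GPUEngine(
    executor="streaming",
    executor_options={"max_rows_per_partition": 1_000_000},
)
result = q.collect(engine=engine)
```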
cudf_polars/testing/plugin.py
CHANGED
```diff
@@ -31,9 +31,12 @@ def pytest_addoption(parser: pytest.Parser) -> None:
 def pytest_configure(config: pytest.Config) -> None:
     """Enable use of this module as a pytest plugin to enable GPU collection."""
     no_fallback = config.getoption("--cudf-polars-no-fallback")
-    collect = polars.LazyFrame.collect
-    engine = polars.GPUEngine(raise_on_fail=no_fallback)
-    polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    if no_fallback:
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(raise_on_fail=no_fallback)
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    else:
+        polars.Config.set_engine_affinity("gpu")
     config.addinivalue_line(
         "filterwarnings",
         "ignore:.*GPU engine does not support streaming or background collection",
```
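
The rewritten `pytest_configure` only monkey-patches `LazyFrame.collect` when `--cudf-polars-no-fallback` is passed; otherwise it relies on the newer `polars.Config.set_engine_affinity("gpu")`, which routes collection to the GPU engine while still permitting CPU fallback. A self-contained sketch of the same `partialmethod` patching technique (an illustration of the pattern, not the plugin module itself; assumes a working cudf-polars install):

```python
from functools import partialmethod

import polars

# Rebind LazyFrame.collect so that every call implicitly passes the GPU
# engine; raise_on_fail=True turns silent CPU fallback into a hard error.
collect = polars.LazyFrame.collect
engine = polars.GPUEngine(raise_on_fail=True)
polars.LazyFrame.collect = partialmethod(collect, engine=engine)

q = polars.LazyFrame({"a": [1, 2, 3]}).select(polars.col("a") * 2)
print(q.collect())  # now equivalent to q.collect(engine=engine)
```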
```diff
@@ -51,22 +54,56 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning",
     "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning",
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_count_star.py::test_count_csv[foods1.csv-27]": "Need fast count for CSV scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_csv[foods*.csv-135]": "Need fast count for CSV scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_commented_csv": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_count_ndjson[foods1.ndjson-27]": "Need fast count for JSON scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_ndjson[foods*.ndjson-54]": "Need fast count for JSON scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_ndjson": "Need fast count for JSON scan",
     "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
-    "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
-    "tests/unit/io/
-    "tests/unit/io/
-    "tests/unit/io/
-    "tests/unit/io/
-    "tests/unit/io/
-    "tests/unit/io/
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition_lambda[io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition_lambda[io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
     "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
@@ -84,48 +121,7 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
-    "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[<lambda>-write_csv]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-<lambda>]": "Need to add include_file_path to IR",
@@ -136,8 +132,6 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
     "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU",
     "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match",
-    "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported",
-    "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero",
     "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
     "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
     "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero",
@@ -165,6 +159,7 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
+    "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
@@ -180,26 +175,30 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype",
     "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
-    "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
-    "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
+    "tests/unit/operations/test_rolling.py::test_rolling_group_by_empty_groups_by_take_6330": "Ordering difference, might be polars bug",
     "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong",
     "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv",
     "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception",
-    "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match",
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
+    "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630",
     "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
-
-    "tests/unit/
-    "tests/unit/
+    "tests/unit/streaming/test_streaming_io.py::test_sink_phases[parquet]": "Debug output on stderr doesn't match",
+    "tests/unit/streaming/test_streaming_io.py::test_sink_phases[ndjson]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type0]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type1]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type2]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type3]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match",
 }
 
 
 TESTS_TO_SKIP: Mapping[str, str] = {
+    "tests/unit/operations/test_profile.py::test_profile_with_cse": "Shape assertion won't match",
     # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks
     # for obsolete timezone names. However, the chrono_tz package that
     # polars uses doesn't read /usr/share/zoneinfo, instead packaging
@@ -209,15 +208,19 @@ TESTS_TO_SKIP: Mapping[str, str] = {
     # polars that the requested timezone is unknown.
     # Since this is random, just skip it, rather than xfailing.
     "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+    # Tests performance difference of CPU engine
+    "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine",
     # The test may segfault with the legacy streaming engine. We should
     # remove this skip when all polars tests use the new streaming engine.
     "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
     # Fails in CI, but passes locally
     "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread",
-    #
-    "tests/unit/
-    "tests/unit/
-    "tests/unit/
+    # Remove when polars supports Pydantic V3
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_pydantic_2x": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects_nested[_TestFooPD-_TestBarPD-_TestBazPD]": "pydantic deprecation warning",
+    "tests/unit/series/test_series.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/streaming/test_streaming.py::test_streaming_apply": "https://github.com/pola-rs/polars/issues/22558",
 }
 
 
@@ -229,18 +232,17 @@ def pytest_collection_modifyitems(
         # Don't xfail tests if running without fallback
        return
     for item in items:
-        if item.nodeid in TESTS_TO_SKIP:
-            item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid]))
-        elif item.nodeid in EXPECTED_FAILURES:
-            if isinstance(EXPECTED_FAILURES[item.nodeid], tuple):
+        if (reason := TESTS_TO_SKIP.get(item.nodeid, None)) is not None:
+            item.add_marker(pytest.mark.skip(reason=reason))
+        elif (entry := EXPECTED_FAILURES.get(item.nodeid, None)) is not None:
+            if isinstance(entry, tuple):
                 # the second entry in the tuple is the condition to xfail on
+                reason, condition = entry
                 item.add_marker(
                     pytest.mark.xfail(
-                        condition=EXPECTED_FAILURES[item.nodeid][1],
-                        reason=EXPECTED_FAILURES[item.nodeid][0],
+                        condition=condition,
+                        reason=reason,
                     ),
                 )
             else:
-                item.add_marker(
-                    pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])
-                )
+                item.add_marker(pytest.mark.xfail(reason=entry))
```
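
Both lookup tables drive `pytest_collection_modifyitems`, and the refactor above replaces repeated `EXPECTED_FAILURES[item.nodeid]` indexing with walrus-bound locals. A condensed, runnable sketch of the same pattern for a hypothetical suite (test IDs and conditions invented for illustration):

```python
import sys

import pytest

# nodeid -> reason, or (reason, condition) for a conditional xfail.
EXPECTED_FAILURES: dict[str, str | tuple[str, bool]] = {
    "tests/test_io.py::test_scan": "debug output on stderr doesn't match",
    "tests/test_groupby.py::test_quantile": ("rounding differs", sys.platform == "win32"),
}
TESTS_TO_SKIP: dict[str, str] = {
    "tests/test_profile.py::test_profile": "shape assertion won't match",
}


def pytest_collection_modifyitems(
    config: pytest.Config, items: list[pytest.Item]
) -> None:
    for item in items:
        if (reason := TESTS_TO_SKIP.get(item.nodeid)) is not None:
            item.add_marker(pytest.mark.skip(reason=reason))
        elif (entry := EXPECTED_FAILURES.get(item.nodeid)) is not None:
            if isinstance(entry, tuple):
                reason, condition = entry
                item.add_marker(pytest.mark.xfail(condition=condition, reason=reason))
            else:
                item.add_marker(pytest.mark.xfail(reason=entry))
```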
cudf_polars/typing/__init__.py
CHANGED
```diff
@@ -1,26 +1,40 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Typing utilities for cudf_polars."""
 
 from __future__ import annotations
 
-from collections.abc import Hashable, Mapping
-from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union
+from collections.abc import Hashable, MutableMapping
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    NewType,
+    Protocol,
+    TypeVar,
+    TypedDict,
+    Union,
+)
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 import pylibcudf as plc
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Mapping
     from typing import TypeAlias
 
     import polars as pl
 
+    from cudf_polars.containers import DataFrame
     from cudf_polars.dsl import expr, ir, nodebase
 
 __all__: list[str] = [
+    "ClosedInterval",
+    "ColumnHeader",
+    "ColumnOptions",
+    "DataFrameHeader",
     "ExprTransformer",
     "GenericTransformer",
     "IRTransformer",
@@ -28,6 +42,8 @@ __all__: list[str] = [
     "OptimizationArgs",
     "PolarsExpr",
     "PolarsIR",
+    "Schema",
+    "Slice",
 ]
 
 PolarsIR: TypeAlias = Union[
@@ -66,7 +82,15 @@ PolarsExpr: TypeAlias = Union[
     pl_expr.PyExprIR,
 ]
 
-Schema: TypeAlias = Mapping[str, plc.DataType]
+Schema: TypeAlias = dict[str, plc.DataType]
+
+Slice: TypeAlias = tuple[int, int | None]
+
+CSECache: TypeAlias = MutableMapping[int, tuple["DataFrame", int]]
+
+ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"]
+
+Duration = NewType("Duration", tuple[int, int, int, int, bool, bool])
 
 
 class NodeTraverser(Protocol):
```
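
The new aliases give names to conventions used throughout the DSL: `Schema` is now a concrete `dict` rather than an abstract `Mapping`, `Slice` is an `(offset, optional length)` pair, and `ClosedInterval` names which endpoints of a window interval are inclusive. A hypothetical helper (not part of the package) showing the intended reading of `Slice`:

```python
from __future__ import annotations

from typing import TypeAlias

Slice: TypeAlias = tuple[int, int | None]  # (offset, optional length)


def slice_bounds(zlice: Slice | None, num_rows: int) -> tuple[int, int]:
    """Resolve a Slice against a frame with num_rows rows."""
    if zlice is None:
        return 0, num_rows
    offset, length = zlice
    if offset < 0:  # negative offsets count from the end, as in polars
        offset = max(num_rows + offset, 0)
    end = num_rows if length is None else min(offset + length, num_rows)
    return offset, end


assert slice_bounds((2, 3), 10) == (2, 5)
assert slice_bounds((-4, None), 10) == (6, 10)
```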
```diff
@@ -84,7 +108,7 @@ class NodeTraverser(Protocol):
         """Convert current plan node to python rep."""
         ...
 
-    def get_schema(self) -> Mapping[str, plc.DataType]:
+    def get_schema(self) -> Schema:
         """Get the schema of the current plan node."""
         ...
 
@@ -145,3 +169,32 @@ ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"]
 
 IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"]
 """Protocol for transformation of IR nodes."""
+
+
+class ColumnOptions(TypedDict):
+    """
+    Column constructor options.
+
+    Notes
+    -----
+    Used to serialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+
+
+class ColumnHeader(TypedDict):
+    """Column serialization header."""
+
+    column_kwargs: ColumnOptions
+    frame_count: int
+
+
+class DataFrameHeader(TypedDict):
+    """DataFrame serialization header."""
+
+    columns_kwargs: list[ColumnOptions]
+    frame_count: int
```
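
These TypedDicts support the new Dask serialization registrations (`experimental/dask_registers.py`, which replaces the old `dask_serialize.py`): the header is a small picklable description of a Column or DataFrame, while the device buffers travel separately as frames. A sketch of constructing a conforming header (the surrounding send/receive protocol is assumed; only the TypedDict shapes come from this diff):

```python
from typing import TypedDict

import pylibcudf as plc


class ColumnOptions(TypedDict):
    is_sorted: plc.types.Sorted
    order: plc.types.Order
    null_order: plc.types.NullOrder
    name: str | None


class ColumnHeader(TypedDict):
    column_kwargs: ColumnOptions
    frame_count: int


# Header for an unsorted, nullable column "a" whose data buffer and
# validity mask ship as two separate frames alongside this header.
header: ColumnHeader = {
    "column_kwargs": {
        "is_sorted": plc.types.Sorted.NO,
        "order": plc.types.Order.ASCENDING,
        "null_order": plc.types.NullOrder.BEFORE,
        "name": "a",
    },
    "frame_count": 2,
}
```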