cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (63)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/testing/plugin.py
@@ -31,9 +31,12 @@ def pytest_addoption(parser: pytest.Parser) -> None:
 def pytest_configure(config: pytest.Config) -> None:
     """Enable use of this module as a pytest plugin to enable GPU collection."""
     no_fallback = config.getoption("--cudf-polars-no-fallback")
-    collect = polars.LazyFrame.collect
-    engine = polars.GPUEngine(raise_on_fail=no_fallback)
-    polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    if no_fallback:
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(raise_on_fail=no_fallback)
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    else:
+        polars.Config.set_engine_affinity("gpu")
     config.addinivalue_line(
         "filterwarnings",
         "ignore:.*GPU engine does not support streaming or background collection",
@@ -51,22 +54,56 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning",
     "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning",
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_count_star.py::test_count_csv[foods1.csv-27]": "Need fast count for CSV scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_csv[foods*.csv-135]": "Need fast count for CSV scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_commented_csv": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_count_ndjson[foods1.ndjson-27]": "Need fast count for JSON scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_ndjson[foods*.ndjson-54]": "Need fast count for JSON scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_ndjson": "Need fast count for JSON scan",
     "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
-    "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition_lambda[io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition_lambda[io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
     "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
@@ -84,48 +121,7 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
-    "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[<lambda>-write_csv]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-<lambda>]": "Need to add include_file_path to IR",
@@ -136,8 +132,6 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
     "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU",
     "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match",
-    "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported",
-    "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero",
     "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
     "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
     "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero",
@@ -165,6 +159,7 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
+    "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
@@ -180,26 +175,30 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype",
     "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
-    "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
-    "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
+    "tests/unit/operations/test_rolling.py::test_rolling_group_by_empty_groups_by_take_6330": "Ordering difference, might be polars bug",
     "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong",
     "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv",
     "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception",
-    "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match",
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
+    "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630",
     "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
-    # Maybe flaky, order-dependent?
-    "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
-    "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
+    "tests/unit/streaming/test_streaming_io.py::test_sink_phases[parquet]": "Debug output on stderr doesn't match",
+    "tests/unit/streaming/test_streaming_io.py::test_sink_phases[ndjson]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type0]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type1]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type2]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type3]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match",
 }
 
 
 TESTS_TO_SKIP: Mapping[str, str] = {
+    "tests/unit/operations/test_profile.py::test_profile_with_cse": "Shape assertion won't match",
     # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks
     # for obsolete timezone names. However, the chrono_tz package that
     # polars uses doesn't read /usr/share/zoneinfo, instead packaging
@@ -209,15 +208,19 @@ TESTS_TO_SKIP: Mapping[str, str] = {
     # polars that the requested timezone is unknown.
     # Since this is random, just skip it, rather than xfailing.
     "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+    # Tests performance difference of CPU engine
+    "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine",
     # The test may segfault with the legacy streaming engine. We should
     # remove this skip when all polars tests use the new streaming engine.
     "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
     # Fails in CI, but passes locally
     "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread",
-    # TODO: Remove once when we support polars 1.23
-    "tests/unit/io/database/test_read.py::test_read_database[uri: connectorx]": "ValueError: arrow2",
-    "tests/unit/io/database/test_read.py::test_read_database_cx_credentials[fakedb://123:456@account/database/schema?warehouse=warehouse&role=role]": "ValueError: arrow2",
-    "tests/unit/io/database/test_read.py::test_read_database_cx_credentials[fakedb://my#%us3r:p433w0rd@not_a_real_host:9999/database]": "ValueError: arrow2",
+    # Remove when polars supports Pydantic V3
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_pydantic_2x": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects_nested[_TestFooPD-_TestBarPD-_TestBazPD]": "pydantic deprecation warning",
+    "tests/unit/series/test_series.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/streaming/test_streaming.py::test_streaming_apply": "https://github.com/pola-rs/polars/issues/22558",
 }
 
 
@@ -229,18 +232,17 @@ def pytest_collection_modifyitems(
         # Don't xfail tests if running without fallback
         return
     for item in items:
-        if item.nodeid in TESTS_TO_SKIP:
-            item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid]))
-        elif item.nodeid in EXPECTED_FAILURES:
-            if isinstance(EXPECTED_FAILURES[item.nodeid], tuple):
+        if (reason := TESTS_TO_SKIP.get(item.nodeid, None)) is not None:
+            item.add_marker(pytest.mark.skip(reason=reason))
+        elif (entry := EXPECTED_FAILURES.get(item.nodeid, None)) is not None:
+            if isinstance(entry, tuple):
                 # the second entry in the tuple is the condition to xfail on
+                reason, condition = entry
                 item.add_marker(
                     pytest.mark.xfail(
-                        condition=EXPECTED_FAILURES[item.nodeid][1],
-                        reason=EXPECTED_FAILURES[item.nodeid][0],
+                        condition=condition,
+                        reason=reason,
                     ),
                 )
             else:
-                item.add_marker(
-                    pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])
-                )
+                item.add_marker(pytest.mark.xfail(reason=entry))
cudf_polars/typing/__init__.py
@@ -1,26 +1,40 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Typing utilities for cudf_polars."""
 
 from __future__ import annotations
 
-from collections.abc import Hashable, Mapping
-from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union
+from collections.abc import Hashable, MutableMapping
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    NewType,
+    Protocol,
+    TypeVar,
+    TypedDict,
+    Union,
+)
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 import pylibcudf as plc
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Mapping
     from typing import TypeAlias
 
     import polars as pl
 
+    from cudf_polars.containers import DataFrame
     from cudf_polars.dsl import expr, ir, nodebase
 
 __all__: list[str] = [
+    "ClosedInterval",
+    "ColumnHeader",
+    "ColumnOptions",
+    "DataFrameHeader",
     "ExprTransformer",
     "GenericTransformer",
     "IRTransformer",
@@ -28,6 +42,8 @@ __all__: list[str] = [
     "OptimizationArgs",
     "PolarsExpr",
     "PolarsIR",
+    "Schema",
+    "Slice",
 ]
 
 PolarsIR: TypeAlias = Union[
@@ -66,7 +82,15 @@ PolarsExpr: TypeAlias = Union[
     pl_expr.PyExprIR,
 ]
 
-Schema: TypeAlias = Mapping[str, plc.DataType]
+Schema: TypeAlias = dict[str, plc.DataType]
+
+Slice: TypeAlias = tuple[int, int | None]
+
+CSECache: TypeAlias = MutableMapping[int, tuple["DataFrame", int]]
+
+ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"]
+
+Duration = NewType("Duration", tuple[int, int, int, int, bool, bool])
 
 
 class NodeTraverser(Protocol):
@@ -84,7 +108,7 @@ class NodeTraverser(Protocol):
         """Convert current plan node to python rep."""
         ...
 
-    def get_schema(self) -> Mapping[str, pl.DataType]:
+    def get_schema(self) -> Schema:
         """Get the schema of the current plan node."""
         ...
 
@@ -145,3 +169,32 @@ ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"]
 
 IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"]
 """Protocol for transformation of IR nodes."""
+
+
+class ColumnOptions(TypedDict):
+    """
+    Column constructor options.
+
+    Notes
+    -----
+    Used to serialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+
+
+class ColumnHeader(TypedDict):
+    """Column serialization header."""
+
+    column_kwargs: ColumnOptions
+    frame_count: int
+
+
+class DataFrameHeader(TypedDict):
+    """DataFrame serialization header."""
+
+    columns_kwargs: list[ColumnOptions]
+    frame_count: int
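
The TypedDicts added above describe the headers attached when Column and DataFrame containers are serialized (for example by the Dask register/serialization paths added elsewhere in this release). A minimal, hypothetical sketch of a ColumnHeader-shaped payload for one unnamed, unsorted column; the field names mirror ColumnOptions and ColumnHeader, the enum values are ordinary pylibcudf types, and the frame_count of 1 is an assumption for illustration:

    import pylibcudf as plc

    # Per-column options, following the ColumnOptions fields above.
    column_kwargs = {
        "is_sorted": plc.types.Sorted.NO,
        "order": plc.types.Order.ASCENDING,
        "null_order": plc.types.NullOrder.BEFORE,
        "name": None,
    }

    # ColumnHeader-shaped dict; frame_count is presumably the number of
    # serialized frames (buffers) that accompany the header.
    column_header = {"column_kwargs": column_kwargs, "frame_count": 1}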