cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (63)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/testing/plugin.py
@@ -31,9 +31,12 @@ def pytest_addoption(parser: pytest.Parser) -> None:
 def pytest_configure(config: pytest.Config) -> None:
     """Enable use of this module as a pytest plugin to enable GPU collection."""
     no_fallback = config.getoption("--cudf-polars-no-fallback")
-    collect = polars.LazyFrame.collect
-    engine = polars.GPUEngine(raise_on_fail=no_fallback)
-    polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    if no_fallback:
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(raise_on_fail=no_fallback)
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    else:
+        polars.Config.set_engine_affinity("gpu")
     config.addinivalue_line(
         "filterwarnings",
         "ignore:.*GPU engine does not support streaming or background collection",
@@ -51,22 +54,56 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning",
     "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning",
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_count_star.py::test_count_csv[foods1.csv-27]": "Need fast count for CSV scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_csv[foods*.csv-135]": "Need fast count for CSV scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_commented_csv": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_lazy_count_star.py::test_count_ndjson[foods1.ndjson-27]": "Need fast count for JSON scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_ndjson[foods*.ndjson-54]": "Need fast count for JSON scan",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_ndjson": "Need fast count for JSON scan",
     "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
-    "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[1-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[2-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-1-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-1-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-4-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-4-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-5-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-5-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-6-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-6-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-7-io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition[3-7-io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition_lambda[io_type1]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
+    "tests/unit/io/test_partition.py::test_max_size_partition_lambda[io_type2]": "Need fast count for Parquet scan: https://github.com/rapidsai/cudf/pull/18463",
     "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
-    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
@@ -84,48 +121,7 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
-    "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[<lambda>-write_csv]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-<lambda>]": "Need to add include_file_path to IR",
@@ -136,8 +132,6 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
     "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU",
     "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match",
-    "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported",
-    "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero",
     "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
     "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
     "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero",
@@ -165,6 +159,7 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
+    "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
@@ -180,26 +175,30 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype",
     "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
-    "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
-    "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
+    "tests/unit/operations/test_rolling.py::test_rolling_group_by_empty_groups_by_take_6330": "Ordering difference, might be polars bug",
     "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong",
     "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv",
     "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception",
-    "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match",
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
+    "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630",
     "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
-    # Maybe flaky, order-dependent?
-    "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
-    "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
+    "tests/unit/streaming/test_streaming_io.py::test_sink_phases[parquet]": "Debug output on stderr doesn't match",
+    "tests/unit/streaming/test_streaming_io.py::test_sink_phases[ndjson]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type0]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type1]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type2]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_async_read_21945[scan_type3]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match",
 }
 
 
 TESTS_TO_SKIP: Mapping[str, str] = {
+    "tests/unit/operations/test_profile.py::test_profile_with_cse": "Shape assertion won't match",
     # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks
     # for obsolete timezone names. However, the chrono_tz package that
     # polars uses doesn't read /usr/share/zoneinfo, instead packaging
@@ -209,15 +208,19 @@ TESTS_TO_SKIP: Mapping[str, str] = {
     # polars that the requested timezone is unknown.
     # Since this is random, just skip it, rather than xfailing.
     "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+    # Tests performance difference of CPU engine
+    "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine",
     # The test may segfault with the legacy streaming engine. We should
     # remove this skip when all polars tests use the new streaming engine.
     "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
     # Fails in CI, but passes locally
     "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread",
-    # TODO: Remove once when we support polars 1.23
-    "tests/unit/io/database/test_read.py::test_read_database[uri: connectorx]": "ValueError: arrow2",
-    "tests/unit/io/database/test_read.py::test_read_database_cx_credentials[fakedb://123:456@account/database/schema?warehouse=warehouse&role=role]": "ValueError: arrow2",
-    "tests/unit/io/database/test_read.py::test_read_database_cx_credentials[fakedb://my#%us3r:p433w0rd@not_a_real_host:9999/database]": "ValueError: arrow2",
+    # Remove when polars supports Pydantic V3
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_pydantic_2x": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects_nested[_TestFooPD-_TestBarPD-_TestBazPD]": "pydantic deprecation warning",
+    "tests/unit/series/test_series.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/streaming/test_streaming.py::test_streaming_apply": "https://github.com/pola-rs/polars/issues/22558",
 }
 
 
@@ -229,18 +232,17 @@ def pytest_collection_modifyitems(
         # Don't xfail tests if running without fallback
         return
     for item in items:
-        if item.nodeid in TESTS_TO_SKIP:
-            item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid]))
-        elif item.nodeid in EXPECTED_FAILURES:
-            if isinstance(EXPECTED_FAILURES[item.nodeid], tuple):
+        if (reason := TESTS_TO_SKIP.get(item.nodeid, None)) is not None:
+            item.add_marker(pytest.mark.skip(reason=reason))
+        elif (entry := EXPECTED_FAILURES.get(item.nodeid, None)) is not None:
+            if isinstance(entry, tuple):
                 # the second entry in the tuple is the condition to xfail on
+                reason, condition = entry
                 item.add_marker(
                     pytest.mark.xfail(
-                        condition=EXPECTED_FAILURES[item.nodeid][1],
-                        reason=EXPECTED_FAILURES[item.nodeid][0],
+                        condition=condition,
+                        reason=reason,
                     ),
                 )
             else:
-                item.add_marker(
-                    pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])
-                )
+                item.add_marker(pytest.mark.xfail(reason=entry))
cudf_polars/typing/__init__.py
@@ -1,26 +1,40 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Typing utilities for cudf_polars."""
 
 from __future__ import annotations
 
-from collections.abc import Hashable, Mapping
-from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union
+from collections.abc import Hashable, MutableMapping
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    NewType,
+    Protocol,
+    TypeVar,
+    TypedDict,
+    Union,
+)
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 import pylibcudf as plc
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Mapping
     from typing import TypeAlias
 
     import polars as pl
 
+    from cudf_polars.containers import DataFrame
     from cudf_polars.dsl import expr, ir, nodebase
 
 __all__: list[str] = [
+    "ClosedInterval",
+    "ColumnHeader",
+    "ColumnOptions",
+    "DataFrameHeader",
     "ExprTransformer",
     "GenericTransformer",
     "IRTransformer",
@@ -28,6 +42,8 @@ __all__: list[str] = [
     "OptimizationArgs",
     "PolarsExpr",
     "PolarsIR",
+    "Schema",
+    "Slice",
 ]
 
 PolarsIR: TypeAlias = Union[
@@ -66,7 +82,15 @@ PolarsExpr: TypeAlias = Union[
     pl_expr.PyExprIR,
 ]
 
-Schema: TypeAlias = Mapping[str, plc.DataType]
+Schema: TypeAlias = dict[str, plc.DataType]
+
+Slice: TypeAlias = tuple[int, int | None]
+
+CSECache: TypeAlias = MutableMapping[int, tuple["DataFrame", int]]
+
+ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"]
+
+Duration = NewType("Duration", tuple[int, int, int, int, bool, bool])
 
 
 class NodeTraverser(Protocol):
@@ -84,7 +108,7 @@ class NodeTraverser(Protocol):
         """Convert current plan node to python rep."""
         ...
 
-    def get_schema(self) -> Mapping[str, pl.DataType]:
+    def get_schema(self) -> Schema:
         """Get the schema of the current plan node."""
         ...
 
@@ -145,3 +169,32 @@ ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"]
 
 IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"]
 """Protocol for transformation of IR nodes."""
+
+
+class ColumnOptions(TypedDict):
+    """
+    Column constructor options.
+
+    Notes
+    -----
+    Used to serialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+
+
+class ColumnHeader(TypedDict):
+    """Column serialization header."""
+
+    column_kwargs: ColumnOptions
+    frame_count: int
+
+
+class DataFrameHeader(TypedDict):
+    """DataFrame serialization header."""
+
+    columns_kwargs: list[ColumnOptions]
+    frame_count: int
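
The TypedDicts added above describe the headers attached when Column and DataFrame containers are serialized (for example by the Dask register/serialization paths added elsewhere in this release). A minimal, hypothetical sketch of a ColumnHeader-shaped payload for one unnamed, unsorted column; the field names mirror ColumnOptions and ColumnHeader, the enum values are ordinary pylibcudf types, and the frame_count of 1 is an assumption for illustration:

    import pylibcudf as plc

    # Per-column options, following the ColumnOptions fields above.
    column_kwargs = {
        "is_sorted": plc.types.Sorted.NO,
        "order": plc.types.Order.ASCENDING,
        "null_order": plc.types.NullOrder.BEFORE,
        "name": None,
    }

    # ColumnHeader-shaped dict; frame_count is presumably the number of
    # serialized frames (buffers) that accompany the header.
    column_header = {"column_kwargs": column_kwargs, "frame_count": 1}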