cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0

cudf_polars/testing/plugin.py

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Plugin for running polars test suite setting GPU engine as default."""
@@ -12,8 +12,12 @@ import pytest
 
 import polars
 
+from cudf_polars.utils.config import StreamingFallbackMode
+from cudf_polars.utils.versions import POLARS_VERSION_LT_135
+
 if TYPE_CHECKING:
     from collections.abc import Mapping
+    from typing import Any
 
 
 def pytest_addoption(parser: pytest.Parser) -> None:
@@ -26,17 +30,50 @@ def pytest_addoption(parser: pytest.Parser) -> None:
         action="store_true",
         help="Turn off fallback to CPU when running tests (default use fallback)",
     )
+    group.addoption(
+        "--executor",
+        action="store",
+        default="in-memory",
+        choices=("in-memory", "streaming"),
+        help="Executor to use for GPUEngine.",
+    )
+    group.addoption(
+        "--blocksize-mode",
+        action="store",
+        default="default",
+        choices=("small", "default"),
+        help=(
+            "Blocksize to use for 'streaming' executor. Set to 'small' "
+            "to run most tests with multiple partitions."
+        ),
+    )
 
 
 def pytest_configure(config: pytest.Config) -> None:
     """Enable use of this module as a pytest plugin to enable GPU collection."""
     no_fallback = config.getoption("--cudf-polars-no-fallback")
+    executor = config.getoption("--executor")
+    blocksize_mode = config.getoption("--blocksize-mode")
     if no_fallback:
         collect = polars.LazyFrame.collect
         engine = polars.GPUEngine(raise_on_fail=no_fallback)
         # https://github.com/python/mypy/issues/2427
-        polars.LazyFrame.collect = partialmethod(collect, engine=engine)  # type: ignore[method-assign,assignment]
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)  # type: ignore[method-assign, assignment]
+    elif executor == "in-memory":
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(executor=executor)
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)  # type: ignore[method-assign, assignment]
+    elif executor == "streaming" and blocksize_mode == "small":
+        executor_options: dict[str, Any] = {}
+        executor_options["max_rows_per_partition"] = 4
+        executor_options["target_partition_size"] = 10
+        # We expect many tests to fall back, so silence the warnings
+        executor_options["fallback_mode"] = StreamingFallbackMode.SILENT
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(executor=executor, executor_options=executor_options)
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)  # type: ignore[method-assign, assignment]
     else:
+        # run with streaming executor and default blocksize
        polars.Config.set_engine_affinity("gpu")
    config.addinivalue_line(
        "filterwarnings",
@@ -57,10 +94,13 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_delta.py::test_scan_delta_schema_evolution_nested_struct_field_19915": "Need to expose hive partitioning",
     "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949",
     "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp_nested": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949",
+    "tests/unit/io/test_delta.py::test_scan_delta_loads_aws_profile_endpoint_url": (
+        "See https://github.com/rapidsai/cudf/pull/20791#issuecomment-3750528419",
+        not POLARS_VERSION_LT_135,
+    ),
     "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
     "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Debug output on stderr doesn't match",
-    "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_partition.py::test_partition_to_memory[io_type0]": "partition sinks not yet supported in standard engine.",
@@ -112,17 +152,12 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394",
-    "tests/unit/io/test_parquet.py::test_field_overwrites_metadata": "cannot serialize in-memory sink target.",
-    "tests/unit/io/test_parquet_field_overwrites.py::test_required_flat": "cannot serialize in-memory sink target.",
-    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype0]": "cannot serialize in-memory sink target.",
-    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype1]": "cannot serialize in-memory sink target.",
-    "tests/unit/io/test_parquet_field_overwrites.py::test_required_struct": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet.py::test_binary_offset_roundtrip": "binary offset type unsupported",
     "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed",
     "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed",
     "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
     "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU",
     "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match",
-    "tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "polars returns decimal column with precision=None",
     "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
     "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
@@ -143,24 +178,26 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[False]": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[True]": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
-    "tests/unit/operations/test_join.py::test_join_filter_pushdown_iejoin": "Row order differs due to multiple matches per left row index; join results are correct but unsorted",
+    # We match the behavior of the polars[cpu] streaming engine (it doesn't make any ordering guarantees either when maintain_order is none).
+    # But this test does, because it is run with the polars[cpu] in-memory engine, which still preserves the order of the left dataframe
+    # when maintain_order is none.
+    "tests/unit/operations/test_join.py::test_join_preserve_order_left": "polars[gpu] makes no ordering guarantees when maintain_order is none",
+    # TODO: As of polars 1.34, the column names for left and right come in unaligned, which causes the dtypes to mismatch when calling plc.replace.replace_nulls.
+    # Need to investigate what changed in polars.
+    "tests/unit/operations/test_join.py::test_join_coalesce_column_order_23177": "Misaligned left/right column names in join op",
     "tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.",
     "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong",
     "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv",
-    "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception",
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
     "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630",
     "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
     "tests/unit/test_predicates.py::test_predicate_pushdown_split_pushable": "Casting that raises not supported on GPU",
     "tests/unit/io/test_scan_row_deletion.py::test_scan_row_deletion_skips_file_with_all_rows_deleted": "The test intentionally corrupts the parquet file, so we cannot read the row count from the header.",
     "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match",
-    "tests/unit/functions/range/test_linear_space.py::test_linear_space_date": "Needs https://github.com/pola-rs/polars/issues/23020",
-    "tests/unit/sql/test_temporal.py::test_implicit_temporal_strings[dt IN ('1960-01-07','2077-01-01','2222-02-22')-expected15]": "Needs https://github.com/pola-rs/polars/issues/23020",
-    "tests/unit/sql/test_operators.py::test_in_not_in[dt NOT IN ('1950-12-24', '1997-07-05')]": "Needs https://github.com/pola-rs/polars/issues/23020",
-    "tests/unit/sql/test_operators.py::test_in_not_in[dt IN ('2020-10-10', '2077-03-18')]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/rapidsai/cudf/issues/20508",
     "tests/unit/datatypes/test_struct.py::test_struct_agg_all": "Needs nested list[struct] support",
     "tests/unit/constructors/test_structs.py::test_constructor_non_strict_schema_17956": "Needs nested list[struct] support",
     "tests/unit/io/test_delta.py::test_read_delta_arrow_map_type": "Needs nested list[struct] support",
@@ -174,8 +211,31 @@ EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR",
-    "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899",
-    "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899",
+    "tests/unit/test_cse.py::test_cse_predicate_self_join[False]": "polars removed the refcount in the logical plan",
+    "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv]": "CSV multiscan with row_index and no row limit is not yet supported.",
+    "tests/unit/io/test_scan.py::test_scan_empty_paths_friendly_error[scan_parquet-failed to retrieve first file schema (parquet)-'parquet scan']": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_scan_empty_paths_friendly_error[scan_ipc-failed to retrieve first file schema (ipc)-'ipc scan']": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_scan_empty_paths_friendly_error[scan_csv-failed to retrieve file schemas (csv)-'csv scan']": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_scan_empty_paths_friendly_error[scan_ndjson-failed to retrieve first file schema (ndjson)-'ndjson scan']": "Debug output on stderr doesn't match",
+    "tests/unit/operations/test_slice.py::test_schema_gather_get_on_literal_24101[lit1-idx2-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_gather_get_on_literal_24101[lit2-idx2-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_gather_get_on_literal_24101[lit1-0-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_gather_get_on_literal_24101[lit1-idx1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_gather_get_on_literal_24101[lit2-0-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_gather_get_on_literal_24101[lit2-idx1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_head_tail_on_literal_24102[lit1-1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_head_tail_on_literal_24102[lit1-len1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_head_tail_on_literal_24102[lit2-1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_head_tail_on_literal_24102[lit2-len1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit2-offset1-0-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit2-offset1-len1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit1-0-len1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit1-offset1-0-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit1-offset1-len1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit2-0-0-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit2-0-len1-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit1-0-0-False]": "Aggregating a list literal: cudf#19610",
+    "tests/unit/operations/namespaces/test_binary.py::test_binary_compounded_literal_aggstate_24460": "Aggregating a list literal: cudf#19610",
 }
 
 
@@ -209,6 +269,16 @@ TESTS_TO_SKIP: Mapping[str, str] = {
     "tests/unit/streaming/test_streaming.py::test_streaming_apply": "https://github.com/pola-rs/polars/issues/22558",
     # New iceberg release causes this test to fail. We can remove this in the next polars version bump: https://github.com/rapidsai/cudf/pull/19912
     "tests/unit/io/test_iceberg.py::test_fill_missing_fields_with_identity_partition_values[False]": "https://github.com/pola-rs/polars/pull/24456",
+    "tests/unit/operations/test_rolling.py::test_rolling_agg_bad_input_types[str]": "https://github.com/rapidsai/cudf/issues/20551",
+}
+
+
+STREAMING_ONLY_EXPECTED_FAILURES: Mapping[str, str] = {
+    "tests/unit/io/test_parquet.py::test_field_overwrites_metadata": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_flat": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype0]": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype1]": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_struct": "cannot serialize in-memory sink target.",
 }
 
 
@@ -222,6 +292,12 @@ def pytest_collection_modifyitems(
     for item in items:
         if (reason := TESTS_TO_SKIP.get(item.nodeid, None)) is not None:
             item.add_marker(pytest.mark.skip(reason=reason))
+        elif (
+            config.getoption("--executor") == "streaming"
+            and (s_reason := STREAMING_ONLY_EXPECTED_FAILURES.get(item.nodeid, None))
+            is not None
+        ):
+            item.add_marker(pytest.mark.xfail(reason=s_reason))
         elif (entry := EXPECTED_FAILURES.get(item.nodeid, None)) is not None:
             if isinstance(entry, tuple):
                 # the second entry in the tuple is the condition to xfail on
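
An `EXPECTED_FAILURES` value is either a bare reason string or a `(reason, condition)` tuple, as in the `test_scan_delta_loads_aws_profile_endpoint_url` entry earlier. The hunk is cut off right after the `isinstance` check, so the following is only a sketch of how such entries are typically applied; the `apply_xfail` helper is hypothetical:

    import pytest

    def apply_xfail(item: pytest.Item, entry: str | tuple[str, bool]) -> None:
        # Hypothetical helper: a tuple entry xfails only when its condition holds
        # (e.g. `not POLARS_VERSION_LT_135`); a plain string xfails unconditionally.
        if isinstance(entry, tuple):
            reason, condition = entry
            item.add_marker(pytest.mark.xfail(condition=condition, reason=reason))
        else:
            item.add_marker(pytest.mark.xfail(reason=entry))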

cudf_polars/typing/__init__.py

@@ -19,7 +19,7 @@ from typing import (
 
 import polars as pl
 import polars.datatypes
-from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
+from polars import polars as plrs  # type: ignore[attr-defined]
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -48,44 +48,45 @@ __all__: list[str] = [
     "OptimizationArgs",
     "PolarsExpr",
     "PolarsIR",
+    "RankMethod",
     "Schema",
     "Slice",
 ]
 
 PolarsIR: TypeAlias = Union[
-    pl_ir.PythonScan,
-    pl_ir.Scan,
-    pl_ir.Cache,
-    pl_ir.DataFrameScan,
-    pl_ir.Select,
-    pl_ir.GroupBy,
-    pl_ir.Join,
-    pl_ir.HStack,
-    pl_ir.Distinct,
-    pl_ir.Sort,
-    pl_ir.Slice,
-    pl_ir.Filter,
-    pl_ir.SimpleProjection,
-    pl_ir.MapFunction,
-    pl_ir.Union,
-    pl_ir.HConcat,
-    pl_ir.ExtContext,
+    plrs._ir_nodes.PythonScan,
+    plrs._ir_nodes.Scan,
+    plrs._ir_nodes.Cache,
+    plrs._ir_nodes.DataFrameScan,
+    plrs._ir_nodes.Select,
+    plrs._ir_nodes.GroupBy,
+    plrs._ir_nodes.Join,
+    plrs._ir_nodes.HStack,
+    plrs._ir_nodes.Distinct,
+    plrs._ir_nodes.Sort,
+    plrs._ir_nodes.Slice,
+    plrs._ir_nodes.Filter,
+    plrs._ir_nodes.SimpleProjection,
+    plrs._ir_nodes.MapFunction,
+    plrs._ir_nodes.Union,
+    plrs._ir_nodes.HConcat,
+    plrs._ir_nodes.ExtContext,
 ]
 
 PolarsExpr: TypeAlias = Union[
-    pl_expr.Function,
-    pl_expr.Window,
-    pl_expr.Literal,
-    pl_expr.Sort,
-    pl_expr.SortBy,
-    pl_expr.Gather,
-    pl_expr.Filter,
-    pl_expr.Cast,
-    pl_expr.Column,
-    pl_expr.Agg,
-    pl_expr.BinaryExpr,
-    pl_expr.Len,
-    pl_expr.PyExprIR,
+    plrs._expr_nodes.Function,
+    plrs._expr_nodes.Window,
+    plrs._expr_nodes.Literal,
+    plrs._expr_nodes.Sort,
+    plrs._expr_nodes.SortBy,
+    plrs._expr_nodes.Gather,
+    plrs._expr_nodes.Filter,
+    plrs._expr_nodes.Cast,
+    plrs._expr_nodes.Column,
+    plrs._expr_nodes.Agg,
+    plrs._expr_nodes.BinaryExpr,
+    plrs._expr_nodes.Len,
+    plrs._expr_nodes.PyExprIR,
 ]
 
 PolarsSchema: TypeAlias = dict[str, pl.DataType]
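
The switch from importing `_expr_nodes`/`_ir_nodes` directly to holding the `plrs` module means the private node classes are now resolved as attributes at use time. A minimal sketch of dispatching on them under that pattern (the `describe` function and its input are illustrative, not part of the package):

    from polars import polars as plrs  # type: ignore[attr-defined]

    def describe(node: object) -> str:
        # Resolve the private node classes through the plrs attribute chain.
        if isinstance(node, plrs._ir_nodes.Scan):
            return "scan"
        if isinstance(node, plrs._expr_nodes.BinaryExpr):
            return "binary expression"
        return type(node).__name__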
@@ -173,6 +174,53 @@ class GenericTransformer(Protocol[U_contra, V_co, StateT_co]):
         ...
 
 
+class _ScalarDataTypeHeader(TypedDict):
+    kind: Literal["scalar"]
+    name: str
+
+
+class _DecimalDataTypeHeader(TypedDict):
+    kind: Literal["decimal"]
+    precision: int
+    scale: int
+
+
+class _DatetimeDataTypeHeader(TypedDict):
+    kind: Literal["datetime"]
+    time_unit: str
+    time_zone: str | None
+
+
+class _DurationDataTypeHeader(TypedDict):
+    kind: Literal["duration"]
+    time_unit: str
+
+
+class _ListDataTypeHeader(TypedDict):
+    kind: Literal["list"]
+    inner: DataTypeHeader
+
+
+class _StructFieldHeader(TypedDict):
+    name: str
+    dtype: DataTypeHeader
+
+
+class _StructDataTypeHeader(TypedDict):
+    kind: Literal["struct"]
+    fields: list[_StructFieldHeader]
+
+
+DataTypeHeader = (
+    _ScalarDataTypeHeader
+    | _DecimalDataTypeHeader
+    | _DatetimeDataTypeHeader
+    | _DurationDataTypeHeader
+    | _ListDataTypeHeader
+    | _StructDataTypeHeader
+)
+
+
 class ColumnOptions(TypedDict):
     """
     Column constructor options.
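
Taken together, these tagged TypedDicts form a recursive, JSON-serializable description of a column's dtype, which `ColumnOptions` below now carries in place of a plain string. A hypothetical header for a `List(Struct)` column with a datetime and a decimal field (shapes read off the classes above; the field names and values are illustrative):

    header: DataTypeHeader = {
        "kind": "list",
        "inner": {
            "kind": "struct",
            "fields": [
                {"name": "ts", "dtype": {"kind": "datetime", "time_unit": "us", "time_zone": None}},
                {"name": "amount", "dtype": {"kind": "decimal", "precision": 38, "scale": 2}},
            ],
        },
    }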
@@ -186,7 +234,7 @@ class ColumnOptions(TypedDict):
     order: plc.types.Order
     null_order: plc.types.NullOrder
     name: str | None
-    dtype: str
+    dtype: DataTypeHeader
 
 
 class DeserializedColumnOptions(TypedDict):
@@ -217,3 +265,9 @@ class DataFrameHeader(TypedDict):
 
     columns_kwargs: list[ColumnOptions]
     frame_count: int
+
+
+# Not public in polars yet
+RankMethod = Literal["ordinal", "dense", "min", "max", "average"]
+
+RoundMethod = Literal["half_away_from_zero", "half_to_even"]