cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registries and is provided for informational purposes only.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0

  """Translate polars IR representation to ours."""
@@ -102,7 +102,7 @@ class Translator:
  # IR is versioned with major.minor, minor is bumped for backwards
  # compatible changes (e.g. adding new nodes), major is bumped for
  # incompatible changes (e.g. renaming nodes).
- if (version := self.visitor.version()) >= (10, 1):
+ if (version := self.visitor.version()) >= (11, 1):
  e = NotImplementedError(
  f"No support for polars IR {version=}"
  ) # pragma: no cover; no such version for now.
@@ -379,12 +379,12 @@ def _align_decimal_scales(
  if (
  left_type.id() != target.id() or left_type.scale() != target.scale()
  ): # pragma: no cover; no test yet
- left = expr.Cast(target, left)
+ left = expr.Cast(target, True, left) # noqa: FBT003

  if (
  right_type.id() != target.id() or right_type.scale() != target.scale()
  ): # pragma: no cover; no test yet
- right = expr.Cast(target, right)
+ right = expr.Cast(target, True, right) # noqa: FBT003

  return left, right

@@ -746,7 +746,7 @@ def _(
  *(translator.translate_expr(n=n, schema=schema) for n in node.input),
  )
  if name in needs_cast:
- return expr.Cast(dtype, result_expr)
+ return expr.Cast(dtype, True, result_expr) # noqa: FBT003
  return result_expr
  elif not POLARS_VERSION_LT_131 and isinstance(
  name, plrs._expr_nodes.StructFunction
@@ -787,6 +787,7 @@ def _(
  if not POLARS_VERSION_LT_134
  else expr.Cast(
  DataType(pl.Float64()),
+ True, # noqa: FBT003
  res,
  )
  )
@@ -996,6 +997,9 @@ def _(
  def _(
  node: plrs._expr_nodes.Cast, translator: Translator, dtype: DataType, schema: Schema
  ) -> expr.Expr:
+ # TODO: node.options can be 2 meaning wrap_numerical=True
+ # don't necessarily raise because wrapping isn't always needed, but it's unhandled
+ strict = node.options != 1
  inner = translator.translate_expr(n=node.expr, schema=schema)

  if plc.traits.is_floating_point(inner.dtype.plc_type) and plc.traits.is_fixed_point(
@@ -1003,6 +1007,7 @@ def _(
  ):
  return expr.Cast(
  dtype,
+ strict,
  expr.UnaryFunction(
  inner.dtype, "round", (-dtype.plc_type.scale(), "half_to_even"), inner
  ),
@@ -1011,11 +1016,8 @@ def _(
  # Push casts into literals so we can handle Cast(Literal(Null))
  if isinstance(inner, expr.Literal):
  return inner.astype(dtype)
- elif isinstance(inner, expr.Cast):
- # Translation of Len/Count-agg put in a cast, remove double
- # casts if we have one.
- (inner,) = inner.children
- return expr.Cast(dtype, inner)
+ else:
+ return expr.Cast(dtype, strict, inner)


  @_translate_expr.register
@@ -1037,7 +1039,7 @@ def _(

  if agg_name not in ("count", "n_unique", "mean", "median", "quantile"):
  args = [
- expr.Cast(dtype, arg)
+ expr.Cast(dtype, True, arg) # noqa: FBT003
  if plc.traits.is_fixed_point(arg.dtype.plc_type)
  and arg.dtype.plc_type != dtype.plc_type
  else arg
@@ -1047,7 +1049,7 @@ def _(
  value = expr.Agg(dtype, agg_name, node.options, translator._expr_context, *args)

  if agg_name in ("count", "n_unique") and value.dtype.id() != plc.TypeId.INT32:
- return expr.Cast(value.dtype, value)
+ return expr.Cast(value.dtype, True, value) # noqa: FBT003
  return value


@@ -1088,11 +1090,12 @@ def _(
  f64 = DataType(pl.Float64())
  return expr.Cast(
  dtype,
+ True, # noqa: FBT003
  expr.BinOp(
  f64,
  expr.BinOp._MAPPING[node.op],
- expr.Cast(f64, left),
- expr.Cast(f64, right),
+ expr.Cast(f64, True, left), # noqa: FBT003
+ expr.Cast(f64, True, right), # noqa: FBT003
  ),
  )

@@ -1132,5 +1135,5 @@ def _(
  ) -> expr.Expr:
  value = expr.Len(dtype)
  if dtype.id() != plc.TypeId.INT32:
- return expr.Cast(dtype, value)
+ return expr.Cast(dtype, True, value) # noqa: FBT003
  return value # pragma: no cover; never reached since polars len has uint32 dtype
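Across the translator hunks above, `expr.Cast` now takes an explicit boolean `strict` flag between the target dtype and the child expression; the Cast translation derives it from the Polars node's `options` field (`options == 1` means non-strict; `options == 2`, wrap_numerical, is left as a TODO). A rough before/after sketch; the import paths and the column name are assumptions, not taken from this diff:

    import polars as pl
    from cudf_polars.containers import DataType
    from cudf_polars.dsl import expr

    f64 = DataType(pl.Float64())
    child = expr.Col(f64, "x")

    # 25.12.0: cast strictness was implicit
    # casted = expr.Cast(f64, child)

    # 26.2.0: strict is the second positional argument
    strict = True  # translate.py derives this as `node.options != 1`
    casted = expr.Cast(f64, strict, child)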
@@ -115,7 +115,7 @@ def decompose_single_agg(
  # - min/max/dense/ordinal -> IDX_DTYPE (UInt32/UInt64)
  post_col: expr.Expr = expr.Col(agg.dtype, name)
  if agg.name == "rank":
- post_col = expr.Cast(agg.dtype, post_col)
+ post_col = expr.Cast(agg.dtype, True, post_col) # noqa: FBT003

  return [(named_expr, True)], named_expr.reconstruct(post_col)
  if isinstance(agg, expr.UnaryFunction) and agg.name == "null_count":
@@ -131,10 +131,10 @@ def decompose_single_agg(
  sum_name = next(name_generator)
  sum_agg = expr.NamedExpr(
  sum_name,
- expr.Agg(u32, "sum", (), context, expr.Cast(u32, is_null_bool)),
+ expr.Agg(u32, "sum", (), context, expr.Cast(u32, True, is_null_bool)), # noqa: FBT003
  )
  return [(sum_agg, True)], named_expr.reconstruct(
- expr.Cast(u32, expr.Col(u32, sum_name))
+ expr.Cast(u32, True, expr.Col(u32, sum_name)) # noqa: FBT003
  )
  if isinstance(agg, expr.Col):
  # TODO: collect_list produces null for empty group in libcudf, empty list in polars.
@@ -201,6 +201,7 @@ def decompose_single_agg(
  agg.dtype
  if plc.traits.is_floating_point(agg.dtype.plc_type)
  else DataType(pl.Float64()),
+ True, # noqa: FBT003
  child,
  )
  child_dtype = child.dtype.plc_type
@@ -229,7 +230,11 @@ def decompose_single_agg(

  if agg.name == "sum":
  col = (
- expr.Cast(agg.dtype, expr.Col(DataType(pl.datatypes.Int64()), name))
+ expr.Cast(
+ agg.dtype,
+ True, # noqa: FBT003
+ expr.Col(DataType(pl.datatypes.Int64()), name),
+ )
  if (
  plc.traits.is_integral(agg.dtype.plc_type)
  and agg.dtype.id() != plc.TypeId.INT64
@@ -282,7 +287,7 @@ def decompose_single_agg(
  ) # libcudf promotes to float64
  if agg.dtype.plc_type.id() == plc.TypeId.FLOAT32:
  # Cast back to float32 to match Polars
- post_agg_col = expr.Cast(agg.dtype, post_agg_col)
+ post_agg_col = expr.Cast(agg.dtype, True, post_agg_col) # noqa: FBT003
  return [(named_expr, True)], named_expr.reconstruct(post_agg_col)
  else:
  return [(named_expr, True)], named_expr.reconstruct(
@@ -115,6 +115,7 @@ class DataSourceInfo:
  """

  _unique_stats_columns: set[str]
+ _read_columns: set[str]

  @property
  def row_count(self) -> ColumnStat[int]: # pragma: no cover
@@ -141,6 +142,10 @@ class DataSourceInfo:
  """Add a column needing unique-value information."""
  self._unique_stats_columns.add(column)

+ def add_read_column(self, column: str) -> None:
+ """Add a column needing to be read."""
+ self._read_columns.add(column)
+

  class DataSourcePair(NamedTuple):
  """Pair of table-source and column-name information."""
@@ -240,6 +245,11 @@ class ColumnSourceInfo:
  for table_source, column_name in self.table_source_pairs:
  table_source.add_unique_stats_column(column or column_name)

+ def add_read_column(self, column: str | None = None) -> None:
+ """Add a column needing to be read."""
+ for table_source, column_name in self.table_source_pairs:
+ table_source.add_read_column(column or column_name)
+

  class ColumnStats:
  """
@@ -610,7 +610,7 @@ class PDSHQueries:
  q1 = (
  part.filter(pl.col("p_brand") == var1)
  .filter(pl.col("p_container") == var2)
- .join(lineitem, how="left", left_on="p_partkey", right_on="l_partkey")
+ .join(lineitem, how="inner", left_on="p_partkey", right_on="l_partkey")
  )

  return (
@@ -256,6 +256,8 @@ class RunConfig:
  query_set: str
  collect_traces: bool = False
  stats_planning: bool
+ max_io_threads: int
+ native_parquet: bool

  def __post_init__(self) -> None: # noqa: D105
  if self.gather_shuffle_stats and self.shuffle != "rapidsmpf":
@@ -371,6 +373,8 @@ class RunConfig:
  query_set=args.query_set,
  collect_traces=args.collect_traces,
  stats_planning=args.stats_planning,
+ max_io_threads=args.max_io_threads,
+ native_parquet=args.native_parquet,
  )

  def serialize(self, engine: pl.GPUEngine | None) -> dict:
@@ -400,6 +404,8 @@ class RunConfig:
  print(f"shuffle_method: {self.shuffle}")
  print(f"broadcast_join_limit: {self.broadcast_join_limit}")
  print(f"stats_planning: {self.stats_planning}")
+ if self.runtime == "rapidsmpf":
+ print(f"native_parquet: {self.native_parquet}")
  if self.cluster == "distributed":
  print(f"n_workers: {self.n_workers}")
  print(f"threads: {self.threads}")
@@ -450,10 +456,16 @@ def get_executor_options(
  executor_options["rapidsmpf_spill"] = run_config.rapidsmpf_spill
  if run_config.cluster == "distributed":
  executor_options["cluster"] = "distributed"
- if run_config.stats_planning:
- executor_options["stats_planning"] = {"use_reduction_planning": True}
+ executor_options["stats_planning"] = {
+ "use_reduction_planning": run_config.stats_planning,
+ "use_sampling": (
+ # Always allow row-group sampling for rapidsmpf runtime
+ run_config.stats_planning or run_config.runtime == "rapidsmpf"
+ ),
+ }
  executor_options["client_device_threshold"] = run_config.spill_device
  executor_options["runtime"] = run_config.runtime
+ executor_options["max_io_threads"] = run_config.max_io_threads

  if (
  benchmark
@@ -879,6 +891,18 @@ def parse_args(
  default=False,
  help="Enable statistics planning.",
  )
+ parser.add_argument(
+ "--max-io-threads",
+ default=2,
+ type=int,
+ help="Maximum number of IO threads for rapidsmpf runtime.",
+ )
+ parser.add_argument(
+ "--native-parquet",
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help="Use C++ read_parquet nodes for the rapidsmpf runtime.",
+ )

  parsed_args = parser.parse_args(args)

@@ -908,6 +932,12 @@ def run_polars(

  if run_config.executor != "cpu":
  executor_options = get_executor_options(run_config, benchmark=benchmark)
+ if run_config.runtime == "rapidsmpf":
+ parquet_options = {
+ "use_rapidsmpf_native": run_config.native_parquet,
+ }
+ else:
+ parquet_options = {}
  engine = pl.GPUEngine(
  raise_on_fail=True,
  memory_resource=rmm.mr.CudaAsyncMemoryResource()
@@ -916,6 +946,7 @@ def run_polars(
  cuda_stream_policy=run_config.stream_policy,
  executor=run_config.executor,
  executor_options=executor_options,
+ parquet_options=parquet_options,
  )

  for q_id in run_config.queries:
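Taken together, the benchmark hunks above surface two new knobs on the GPU engine: `max_io_threads` is forwarded through `executor_options`, and `--native-parquet` toggles the rapidsmpf-native parquet reader through `parquet_options`. A minimal sketch of the resulting engine construction; the executor string and option values are illustrative, not taken from this diff:

    import polars as pl

    engine = pl.GPUEngine(
        raise_on_fail=True,
        executor="streaming",
        executor_options={
            "runtime": "rapidsmpf",
            "max_io_threads": 2,  # forwarded from --max-io-threads
            "stats_planning": {
                "use_reduction_planning": False,
                # row-group sampling is always allowed for the rapidsmpf runtime
                "use_sampling": True,
            },
        },
        parquet_options={"use_rapidsmpf_native": True},  # from --native-parquet
    )
    # result = lazy_frame.collect(engine=engine)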
@@ -1163,6 +1194,45 @@ PDSH_TABLE_NAMES: list[str] = [
  ]


+ def print_duckdb_plan(
+ q_id: int,
+ sql: str,
+ dataset_path: Path,
+ suffix: str,
+ query_set: str,
+ args: argparse.Namespace,
+ ) -> None:
+ """Print DuckDB query plan using EXPLAIN."""
+ if duckdb is None:
+ raise ImportError(duckdb_err)
+
+ if query_set == "pdsds":
+ tbl_names = PDSDS_TABLE_NAMES
+ else:
+ tbl_names = PDSH_TABLE_NAMES
+
+ with duckdb.connect() as conn:
+ for name in tbl_names:
+ pattern = (Path(dataset_path) / name).as_posix() + suffix
+ conn.execute(
+ f"CREATE OR REPLACE VIEW {name} AS "
+ f"SELECT * FROM parquet_scan('{pattern}');"
+ )
+
+ if args.explain_logical and args.explain:
+ conn.execute("PRAGMA explain_output = 'all';")
+ elif args.explain_logical:
+ conn.execute("PRAGMA explain_output = 'optimized_only';")
+ else:
+ conn.execute("PRAGMA explain_output = 'physical_only';")
+
+ print(f"\nDuckDB Query {q_id} - Plan\n")
+
+ plan_rows = conn.execute(f"EXPLAIN {sql}").fetchall()
+ for _, line in plan_rows:
+ print(line)
+
+
  def execute_duckdb_query(
  query: str,
  dataset_path: Path,
@@ -1203,6 +1273,17 @@ def run_duckdb(
  raise NotImplementedError(f"Query {q_id} not implemented.") from err

  sql = get_q(run_config)
+
+ if args.explain or args.explain_logical:
+ print_duckdb_plan(
+ q_id=q_id,
+ sql=sql,
+ dataset_path=run_config.dataset_path,
+ suffix=run_config.suffix,
+ query_set=duckdb_queries_cls.name,
+ args=args,
+ )
+
  print(f"DuckDB Executing: {q_id}")
  records[q_id] = []
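The EXPLAIN flow added above reduces to a view over the parquet files plus a `PRAGMA explain_output` setting. A self-contained sketch; the table name, path, and query are made up for illustration:

    import duckdb

    with duckdb.connect() as conn:
        conn.execute(
            "CREATE OR REPLACE VIEW lineitem AS "
            "SELECT * FROM parquet_scan('/data/pdsh/lineitem/*.parquet');"
        )
        # 'all' and 'optimized_only' are the logical-plan variants used above
        conn.execute("PRAGMA explain_output = 'physical_only';")
        for _, line in conn.execute("EXPLAIN SELECT count(*) FROM lineitem").fetchall():
            print(line)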
@@ -97,6 +97,7 @@ def lower_distinct(
  child.schema,
  shuffle_keys,
  config_options.executor.shuffle_method,
+ config_options.executor.shuffler_insertion_method,
  child,
  )
  partition_info[child] = PartitionInfo(
@@ -150,6 +151,7 @@ def lower_distinct(
  new_node.schema,
  shuffle_keys,
  config_options.executor.shuffle_method,
+ config_options.executor.shuffler_insertion_method,
  new_node,
  )
  partition_info[new_node] = PartitionInfo(count=output_count)
@@ -71,7 +71,7 @@ def explain_query(

  lowered_ir, partition_info, _ = rapidsmpf_lower_ir_graph(ir, config)
  else:
- lowered_ir, partition_info = lower_ir_graph(ir, config)
+ lowered_ir, partition_info, _ = lower_ir_graph(ir, config)
  return _repr_ir_tree(lowered_ir, partition_info)
  else:
  if config.executor.name == "streaming":
@@ -41,7 +41,7 @@ from cudf_polars.dsl.expressions.aggregation import Agg
  from cudf_polars.dsl.expressions.base import Col, ExecutionContext, Expr, NamedExpr
  from cudf_polars.dsl.expressions.binaryop import BinOp
  from cudf_polars.dsl.expressions.literal import Literal
- from cudf_polars.dsl.expressions.unary import Cast, UnaryFunction
+ from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction
  from cudf_polars.dsl.ir import IR, Distinct, Empty, HConcat, Select
  from cudf_polars.dsl.traversal import (
  CachingVisitor,
@@ -236,7 +236,7 @@ def _decompose_unique(


  def _decompose_agg_node(
- agg: Agg,
+ agg: Agg | Len,
  input_ir: IR,
  partition_info: MutableMapping[IR, PartitionInfo],
  config_options: ConfigOptions,
@@ -272,7 +272,7 @@ def _decompose_agg_node(
  """
  expr: Expr
  exprs: list[Expr]
- if agg.name == "count":
+ if isinstance(agg, Len) or agg.name == "count":
  # Chunkwise stage
  columns, input_ir, partition_info = select(
  [agg],
@@ -350,6 +350,7 @@ def _decompose_agg_node(
  input_ir.schema,
  shuffle_on,
  config_options.executor.shuffle_method,
+ config_options.executor.shuffler_insertion_method,
  input_ir,
  )
  partition_info[input_ir] = PartitionInfo(
@@ -359,7 +360,7 @@ def _decompose_agg_node(

  # Chunkwise stage
  columns, input_ir, partition_info = select(
- [Cast(agg.dtype, agg)],
+ [Cast(agg.dtype, True, agg)], # noqa: FBT003
  input_ir,
  partition_info,
  names=names,
@@ -453,7 +454,9 @@ def _decompose_expr_node(
  if partition_count == 1 or expr.is_pointwise:
  # Single-partition and pointwise expressions are always supported.
  return expr, input_ir, partition_info
- elif isinstance(expr, Agg) and expr.name in _SUPPORTED_AGGS:
+ elif isinstance(expr, Len) or (
+ isinstance(expr, Agg) and expr.name in _SUPPORTED_AGGS
+ ):
  # This is a supported Agg expression.
  return _decompose_agg_node(
  expr, input_ir, partition_info, config_options, names=names
@@ -249,6 +249,7 @@ def _(
  child.schema,
  ir.keys,
  config_options.executor.shuffle_method,
+ config_options.executor.shuffler_insertion_method,
  child,
  )
  partition_info[child] = PartitionInfo(
@@ -291,6 +292,7 @@ def _(
  gb_pwise.schema,
  grouped_keys,
  config_options.executor.shuffle_method,
+ config_options.executor.shuffler_insertion_method,
  gb_pwise,
  )
  partition_info[gb_inter] = PartitionInfo(count=post_aggregation_count)
@@ -709,6 +709,8 @@ class ParquetSourceInfo(DataSourceInfo):
  # Helper attributes
  self._key_columns: set[str] = set() # Used to fuse lazy row-group sampling
  self._unique_stats: dict[str, UniqueStats] = {}
+ self._read_columns: set[str] = set()
+ self._real_rg_size: dict[str, int] = {}

  @functools.cached_property
  def metadata(self) -> ParquetMetadata:
@@ -731,11 +733,13 @@ class ParquetSourceInfo(DataSourceInfo):
  return

  column_names = self.metadata.column_names
- if not (
- key_columns := [key for key in self._key_columns if key in column_names]
- ): # pragma: no cover; should never get here
- # No key columns found in the file
- raise ValueError(f"None of {self._key_columns} in {column_names}")
+ key_columns = [key for key in self._key_columns if key in column_names]
+ read_columns = list(
+ self._read_columns.intersection(column_names).union(key_columns)
+ )
+ if not read_columns: # pragma: no cover; should never get here
+ # No key columns or read columns found in the file
+ raise ValueError(f"None of {read_columns} in {column_names}")

  sampled_file_count = len(sample_paths)
  num_row_groups_per_file = self.metadata.num_row_groups_per_file
@@ -745,15 +749,15 @@ class ParquetSourceInfo(DataSourceInfo):
  ):
  raise ValueError("Parquet metadata sampling failed.") # pragma: no cover

- n = 0
+ n_sampled = 0
  samples: defaultdict[str, list[int]] = defaultdict(list)
  for path, num_rgs in zip(sample_paths, num_row_groups_per_file, strict=True):
  for rg_id in range(num_rgs):
- n += 1
+ n_sampled += 1
  samples[path].append(rg_id)
- if n == self.max_row_group_samples:
+ if n_sampled == self.max_row_group_samples:
  break
- if n == self.max_row_group_samples:
+ if n_sampled == self.max_row_group_samples:
  break

  exact = sampled_file_count == len(
@@ -763,7 +767,7 @@ class ParquetSourceInfo(DataSourceInfo):
  options = plc.io.parquet.ParquetReaderOptions.builder(
  plc.io.SourceInfo(list(samples))
  ).build()
- options.set_columns(key_columns)
+ options.set_columns(read_columns)
  options.set_row_groups(list(samples.values()))
  stream = get_cuda_stream()
  tbl_w_meta = plc.io.parquet.read_parquet(options, stream=stream)
@@ -773,30 +777,32 @@ class ParquetSourceInfo(DataSourceInfo):
  tbl_w_meta.columns,
  strict=True,
  ):
- row_group_unique_count = plc.stream_compaction.distinct_count(
- column,
- plc.types.NullPolicy.INCLUDE,
- plc.types.NanPolicy.NAN_IS_NULL,
- stream=stream,
- )
- fraction = row_group_unique_count / row_group_num_rows
- # Assume that if every row is unique then this is a
- # primary key otherwise it's a foreign key and we
- # can't use the single row group count estimate.
- # Example, consider a "foreign" key that has 100
- # unique values. If we sample from a single row group,
- # we likely obtain a unique count of 100. But we can't
- # necessarily deduce that that means that the unique
- # count is 100 / num_rows_in_group * num_rows_in_file
- count: int | None = None
- if exact:
- count = row_group_unique_count
- elif row_group_unique_count == row_group_num_rows:
- count = self.row_count.value
- self._unique_stats[name] = UniqueStats(
- ColumnStat[int](value=count, exact=exact),
- ColumnStat[float](value=fraction, exact=exact),
- )
+ self._real_rg_size[name] = column.device_buffer_size() // n_sampled
+ if name in key_columns:
+ row_group_unique_count = plc.stream_compaction.distinct_count(
+ column,
+ plc.types.NullPolicy.INCLUDE,
+ plc.types.NanPolicy.NAN_IS_NULL,
+ stream=stream,
+ )
+ fraction = row_group_unique_count / row_group_num_rows
+ # Assume that if every row is unique then this is a
+ # primary key otherwise it's a foreign key and we
+ # can't use the single row group count estimate.
+ # Example, consider a "foreign" key that has 100
+ # unique values. If we sample from a single row group,
+ # we likely obtain a unique count of 100. But we can't
+ # necessarily deduce that that means that the unique
+ # count is 100 / num_rows_in_group * num_rows_in_file
+ count: int | None = None
+ if exact:
+ count = row_group_unique_count
+ elif row_group_unique_count == row_group_num_rows:
+ count = self.row_count.value
+ self._unique_stats[name] = UniqueStats(
+ ColumnStat[int](value=count, exact=exact),
+ ColumnStat[float](value=fraction, exact=exact),
+ )
  stream.synchronize()

  def _update_unique_stats(self, column: str) -> None:
@@ -822,6 +828,15 @@ class ParquetSourceInfo(DataSourceInfo):
  # the row count, because dictionary encoding can make the
  # in-memory size much larger.
  min_value = max(1, row_count // file_count)
+ if partial_mean_size < min_value and column not in self._real_rg_size:
+ # If the metadata is suspiciously small,
+ # sample "real" data to get a better estimate.
+ self._sample_row_groups()
+ if column in self._real_rg_size:
+ partial_mean_size = int(
+ self._real_rg_size[column]
+ * statistics.mean(self.metadata.num_row_groups_per_file)
+ )
  return ColumnStat[int](max(min_value, partial_mean_size))
  return ColumnStat[int]()
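The fallback added above replaces a suspiciously small metadata-based column size with an estimate built from sampled device buffers: bytes per row group observed during sampling times the mean number of row groups per file. A worked example with made-up numbers:

    # Hypothetical sample: one column occupied 8 MiB of device memory across
    # 4 sampled row groups, and the dataset averages 10 row groups per file.
    real_rg_size = (8 * 1024**2) // 4   # bytes per row group, from device_buffer_size()
    mean_row_groups_per_file = 10
    partial_mean_size = real_rg_size * mean_row_groups_per_file
    print(partial_mean_size)            # 20971520 bytes, i.e. ~20 MiB per file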
@@ -863,14 +878,19 @@ def _extract_scan_stats(
  config_options.parquet_options.max_row_group_samples,
  config_options.executor.stats_planning,
  )
- return {
+ cstats = {
  name: ColumnStats(
  name=name,
  source_info=ColumnSourceInfo(DataSourcePair(table_source_info, name)),
  )
  for name in ir.schema
  }
-
+ # Mark all columns that we are reading in case
+ # we need to sample real data later.
+ if config_options.executor.stats_planning.use_sampling:
+ for name, cs in cstats.items():
+ cs.source_info.add_read_column(name)
+ return cstats
  else:
  return {name: ColumnStats(name=name) for name in ir.schema}

@@ -889,10 +909,10 @@ class DataFrameSourceInfo(DataSourceInfo):

  def __init__(
  self,
- df: Any,
+ df: pl.DataFrame,
  stats_planning: StatsPlanningOptions,
  ):
- self._df = df
+ self._pdf = df
  self._stats_planning = stats_planning
  self._key_columns: set[str] = set()
  self._unique_stats_columns = set()
@@ -901,17 +921,19 @@ class DataFrameSourceInfo(DataSourceInfo):
  @functools.cached_property
  def row_count(self) -> ColumnStat[int]:
  """Data source row-count estimate."""
- return ColumnStat[int](value=self._df.height(), exact=True)
+ return ColumnStat[int](value=self._pdf.height, exact=True)

  def _update_unique_stats(self, column: str) -> None:
  if column not in self._unique_stats and self._stats_planning.use_sampling:
  row_count = self.row_count.value
  try:
  unique_count = (
- self._df.get_column(column).approx_n_unique() if row_count else 0
+ self._pdf._df.get_column(column).approx_n_unique()
+ if row_count
+ else 0
  )
  except pl.exceptions.InvalidOperationError: # pragma: no cover
- unique_count = self._df.get_column(column).n_unique()
+ unique_count = self._pdf._df.get_column(column).n_unique()
  unique_fraction = min((unique_count / row_count), 1.0) if row_count else 1.0
  self._unique_stats[column] = UniqueStats(
  ColumnStat[int](value=unique_count),
@@ -932,7 +954,7 @@ def _extract_dataframescan_stats(
  "Only streaming executor is supported in _extract_dataframescan_stats"
  )
  table_source_info = DataFrameSourceInfo(
- ir.df,
+ pl.DataFrame._from_pydf(ir.df),
  config_options.executor.stats_planning,
  )
  return {
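The DataFrameSourceInfo hunks above wrap the raw PyDataFrame in a `pl.DataFrame` handle and estimate uniqueness with `approx_n_unique`, falling back to the exact `n_unique` when approximation is unsupported. A standalone sketch of the same estimate using the public polars API (illustrative data; the production code goes through the internal `_df` handle):

    import polars as pl

    pdf = pl.DataFrame({"key": [1, 1, 2, 3, 3, 3]})
    row_count = pdf.height
    try:
        unique_count = pdf.get_column("key").approx_n_unique() if row_count else 0
    except pl.exceptions.InvalidOperationError:  # e.g. unsupported dtype
        unique_count = pdf.get_column("key").n_unique()
    unique_fraction = min(unique_count / row_count, 1.0) if row_count else 1.0
    print(unique_count, unique_fraction)  # typically 3 and 0.5 for this data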