cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
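
Judging by the function and class names in the hunks (Record, RunConfig, parse_args, run_polars) and the matching -39 deletion count in the listing above, the diff body reproduced below appears to come from cudf_polars/experimental/benchmarks/utils.py. It replaces the deprecated -s/--scheduler benchmark flag with -c/--cluster and adds --runtime, --stream-policy, --collect-traces, --max-io-threads, and --native-parquet. As a rough, illustrative sketch only (not code from either wheel), the engine configuration those flags feed into has the shape shown next; the option keys mirror what get_executor_options() and run_polars() set in the diff below, while the concrete values are hypothetical examples.

import polars as pl

# Sketch: how the new benchmark flags surface in the GPU engine configuration.
# Keys mirror get_executor_options()/run_polars() in the diff below; the values
# chosen here are hypothetical examples, not defaults.
engine = pl.GPUEngine(
    raise_on_fail=True,
    executor="streaming",
    executor_options={
        "cluster": "distributed",      # was executor_options["scheduler"] in 25.10.0
        "runtime": "rapidsmpf",        # new --runtime flag
        "max_io_threads": 2,           # new --max-io-threads flag
        "shuffle_method": "rapidsmpf",
    },
    parquet_options={"use_rapidsmpf_native": True},  # new --native-parquet flag
)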
@@ -8,27 +8,43 @@ from __future__ import annotations
 import argparse
 import dataclasses
 import importlib
+import io
+import itertools
 import json
+import logging
 import os
 import statistics
 import sys
 import textwrap
 import time
 import traceback
+import warnings
 from collections import defaultdict
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, assert_never
 
 import nvtx
 
 import polars as pl
 
+import rmm.statistics
+
+try:
+    import duckdb
+
+    duckdb_err = None
+except ImportError as e:
+    duckdb = None
+    duckdb_err = e
+
 try:
     import pynvml
 except ImportError:
     pynvml = None
 
 try:
+    from cudf_polars.dsl.ir import IRExecutionContext
     from cudf_polars.dsl.translate import Translator
     from cudf_polars.experimental.explain import explain_query
     from cudf_polars.experimental.parallel import evaluate_streaming
@@ -41,7 +57,17 @@ except ImportError:
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Sequence
-    from pathlib import Path
+
+
+try:
+    import structlog
+    import structlog.contextvars
+    import structlog.processors
+    import structlog.stdlib
+except ImportError:
+    _HAS_STRUCTLOG = False
+else:
+    _HAS_STRUCTLOG = True
 
 
 ExecutorType = Literal["in-memory", "streaming", "cpu"]
@@ -52,8 +78,28 @@ class Record:
     """Results for a single run of a single PDS-H query."""
 
     query: int
+    iteration: int
     duration: float
     shuffle_stats: dict[str, dict[str, int | float]] | None = None
+    traces: list[dict[str, Any]] | None = None
+
+    @classmethod
+    def new(
+        cls,
+        query: int,
+        iteration: int,
+        duration: float,
+        shuffle_stats: dict[str, dict[str, int | float]] | None = None,
+        traces: list[dict[str, Any]] | None = None,
+    ) -> Record:
+        """Create a Record from plain data."""
+        return cls(
+            query=query,
+            iteration=iteration,
+            duration=duration,
+            shuffle_stats=shuffle_stats,
+            traces=traces,
+        )
 
 
 @dataclasses.dataclass
@@ -181,7 +227,10 @@ class RunConfig:
     queries: list[int]
     suffix: str
     executor: ExecutorType
-    scheduler: str
+    runtime: str
+    stream_policy: str | None
+    cluster: str
+    scheduler: str  # Deprecated, kept for backward compatibility
     n_workers: int
     versions: PackageVersions = dataclasses.field(
         default_factory=PackageVersions.collect
@@ -205,7 +254,10 @@ class RunConfig:
     rapidsmpf_spill: bool
     spill_device: float
     query_set: str
+    collect_traces: bool = False
     stats_planning: bool
+    max_io_threads: int
+    native_parquet: bool
 
     def __post_init__(self) -> None:  # noqa: D105
         if self.gather_shuffle_stats and self.shuffle != "rapidsmpf":
@@ -217,10 +269,38 @@ class RunConfig:
     def from_args(cls, args: argparse.Namespace) -> RunConfig:
         """Create a RunConfig from command line arguments."""
         executor: ExecutorType = args.executor
+        cluster = args.cluster
         scheduler = args.scheduler
+        runtime = args.runtime
+        stream_policy = args.stream_policy
+
+        # Handle "auto" stream policy
+        if stream_policy == "auto":
+            stream_policy = None
 
+        # Deal with deprecated scheduler argument
+        # and non-streaming executors
         if executor == "in-memory" or executor == "cpu":
+            cluster = None
             scheduler = None
+        elif scheduler is not None:
+            if cluster is not None:
+                raise ValueError(
+                    "Cannot specify both -s/--scheduler and -c/--cluster. "
+                    "Please use -c/--cluster only."
+                )
+            else:
+                warnings.warn(
+                    "The -s/--scheduler argument is deprecated. Use -c/--cluster instead.",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+                cluster = "single" if scheduler == "synchronous" else "distributed"
+        elif cluster is not None:
+            scheduler = "synchronous" if cluster == "single" else "distributed"
+        else:
+            cluster = "single"
+            scheduler = "synchronous"
 
         path = args.path
         name = args.query_set
@@ -240,12 +320,25 @@ class RunConfig:
             scale_factor = _infer_scale_factor(name, path, args.suffix)
         if path is None:
             path = f"{args.root}/scale-{scale_factor}"
+
+        scale_factor = float(scale_factor)
         try:
-            scale_factor = int(scale_factor)
+            scale_factor_int = int(scale_factor)
         except ValueError:
-            scale_factor = float(scale_factor)
-
-        if "pdsh" in name and args.scale is not None:
+            pass
+        else:
+            if scale_factor_int == scale_factor:
+                scale_factor = scale_factor_int
+
+        skip_scale_factor_inference = (
+            "LIBCUDF_IO_REROUTE_LOCAL_DIR_PATTERN" in os.environ
+        ) and ("LIBCUDF_IO_REROUTE_REMOTE_DIR_PATTERN" in os.environ)
+
+        if (
+            "pdsh" in name
+            and args.scale is not None
+            and skip_scale_factor_inference is False
+        ):
             # Validate the user-supplied scale factor
             sf_inf = _infer_scale_factor(name, path, args.suffix)
             rel_error = abs((scale_factor - sf_inf) / sf_inf)
@@ -258,7 +351,10 @@ class RunConfig:
         return cls(
             queries=args.query,
             executor=executor,
+            cluster=cluster,
             scheduler=scheduler,
+            runtime=runtime,
+            stream_policy=stream_policy,
             n_workers=args.n_workers,
             shuffle=args.shuffle,
             gather_shuffle_stats=args.rapidsmpf_dask_statistics,
@@ -275,7 +371,10 @@ class RunConfig:
             rapidsmpf_spill=args.rapidsmpf_spill,
             max_rows_per_partition=args.max_rows_per_partition,
             query_set=args.query_set,
+            collect_traces=args.collect_traces,
             stats_planning=args.stats_planning,
+            max_io_threads=args.max_io_threads,
+            native_parquet=args.native_parquet,
         )
 
     def serialize(self, engine: pl.GPUEngine | None) -> dict:
@@ -297,13 +396,17 @@ class RunConfig:
         print(f"path: {self.dataset_path}")
         print(f"scale_factor: {self.scale_factor}")
         print(f"executor: {self.executor}")
+        print(f"stream_policy: {self.stream_policy}")
         if self.executor == "streaming":
-            print(f"scheduler: {self.scheduler}")
+            print(f"runtime: {self.runtime}")
+            print(f"cluster: {self.cluster}")
             print(f"blocksize: {self.blocksize}")
             print(f"shuffle_method: {self.shuffle}")
             print(f"broadcast_join_limit: {self.broadcast_join_limit}")
             print(f"stats_planning: {self.stats_planning}")
-        if self.scheduler == "distributed":
+            if self.runtime == "rapidsmpf":
+                print(f"native_parquet: {self.native_parquet}")
+        if self.cluster == "distributed":
             print(f"n_workers: {self.n_workers}")
             print(f"threads: {self.threads}")
             print(f"rmm_async: {self.rmm_async}")
@@ -338,20 +441,31 @@ def get_executor_options(
     """Generate executor_options for GPUEngine."""
     executor_options: dict[str, Any] = {}
 
-    if run_config.blocksize:
-        executor_options["target_partition_size"] = run_config.blocksize
-    if run_config.max_rows_per_partition:
-        executor_options["max_rows_per_partition"] = run_config.max_rows_per_partition
-    if run_config.shuffle:
-        executor_options["shuffle_method"] = run_config.shuffle
-    if run_config.broadcast_join_limit:
-        executor_options["broadcast_join_limit"] = run_config.broadcast_join_limit
-    if run_config.rapidsmpf_spill:
-        executor_options["rapidsmpf_spill"] = run_config.rapidsmpf_spill
-    if run_config.scheduler == "distributed":
-        executor_options["scheduler"] = "distributed"
-    if run_config.stats_planning:
-        executor_options["stats_planning"] = {"use_reduction_planning": True}
+    if run_config.executor == "streaming":
+        if run_config.blocksize:
+            executor_options["target_partition_size"] = run_config.blocksize
+        if run_config.max_rows_per_partition:
+            executor_options["max_rows_per_partition"] = (
+                run_config.max_rows_per_partition
+            )
+        if run_config.shuffle:
+            executor_options["shuffle_method"] = run_config.shuffle
+        if run_config.broadcast_join_limit:
+            executor_options["broadcast_join_limit"] = run_config.broadcast_join_limit
+        if run_config.rapidsmpf_spill:
+            executor_options["rapidsmpf_spill"] = run_config.rapidsmpf_spill
+        if run_config.cluster == "distributed":
+            executor_options["cluster"] = "distributed"
+        executor_options["stats_planning"] = {
+            "use_reduction_planning": run_config.stats_planning,
+            "use_sampling": (
+                # Always allow row-group sampling for rapidsmpf runtime
+                run_config.stats_planning or run_config.runtime == "rapidsmpf"
+            ),
+        }
+        executor_options["client_device_threshold"] = run_config.spill_device
+        executor_options["runtime"] = run_config.runtime
+        executor_options["max_io_threads"] = run_config.max_io_threads
 
     if (
         benchmark
@@ -390,7 +504,7 @@ def print_query_plan(
     if args.explain_logical:
         print(f"\nQuery {q_id} - Logical plan\n")
         print(explain_query(q, engine, physical=False))
-    if args.explain:
+    if args.explain and run_config.executor == "streaming":
         print(f"\nQuery {q_id} - Physical plan\n")
         print(explain_query(q, engine))
     else:
@@ -399,9 +513,9 @@
         )
 
 
-def initialize_dask_cluster(run_config: RunConfig, args: argparse.Namespace):  # type: ignore
+def initialize_dask_cluster(run_config: RunConfig, args: argparse.Namespace):  # type: ignore[no-untyped-def]
     """Initialize a Dask distributed cluster."""
-    if run_config.scheduler != "distributed":
+    if run_config.cluster != "distributed":
         return None
 
     from dask_cuda import LocalCUDACluster
@@ -437,6 +551,10 @@ def initialize_dask_cluster(run_config: RunConfig, args: argparse.Namespace): #
                     }
                 ),
             )
+            # Setting this globally makes the peak statistics not meaningful
+            # across queries / iterations. But doing it per query isn't worth
+            # the effort right now.
+            client.run(rmm.statistics.enable_statistics)
         except ImportError as err:
             if run_config.shuffle == "rapidsmpf":
                 raise ImportError(
@@ -468,10 +586,18 @@ def execute_query(
     if args.debug:
         translator = Translator(q._ldf.visit(), engine)
         ir = translator.translate_ir()
+        context = IRExecutionContext.from_config_options(
+            translator.config_options
+        )
         if run_config.executor == "in-memory":
-            return ir.evaluate(cache={}, timer=None).to_polars()
+            return ir.evaluate(
+                cache={}, timer=None, context=context
+            ).to_polars()
         elif run_config.executor == "streaming":
-            return evaluate_streaming(ir, translator.config_options).to_polars()
+            return evaluate_streaming(
+                ir,
+                translator.config_options,
+            )
         assert_never(run_config.executor)
     else:
         return q.collect(engine=engine)
@@ -558,22 +684,51 @@ def parse_args(
             - streaming : Partitioned evaluation (default)
             - cpu : Use Polars CPU engine"""),
     )
+    parser.add_argument(
+        "-c",
+        "--cluster",
+        default=None,
+        type=str,
+        choices=["single", "distributed"],
+        help=textwrap.dedent("""\
+            Cluster type to use with the 'streaming' executor.
+            - single : Run locally in a single process
+            - distributed : Use Dask for multi-GPU execution"""),
+    )
     parser.add_argument(
         "-s",
        "--scheduler",
-        default="synchronous",
+        default=None,
         type=str,
        choices=["synchronous", "distributed"],
        help=textwrap.dedent("""\
+            *Deprecated*: Use --cluster instead.
+
             Scheduler type to use with the 'streaming' executor.
             - synchronous : Run locally in a single process
             - distributed : Use Dask for multi-GPU execution"""),
     )
+    parser.add_argument(
+        "--runtime",
+        type=str,
+        choices=["tasks", "rapidsmpf"],
+        default="tasks",
+        help="Runtime to use for the streaming executor (tasks or rapidsmpf).",
+    )
+    parser.add_argument(
+        "--stream-policy",
+        type=str,
+        choices=["auto", "default", "new", "pool"],
+        default="auto",
+        help=textwrap.dedent("""\
+            CUDA stream policy (auto, default, new, pool).
+            Default: auto (use the default policy for the runtime)"""),
    )
     parser.add_argument(
         "--n-workers",
         default=1,
         type=int,
-        help="Number of Dask-CUDA workers (requires 'distributed' scheduler).",
+        help="Number of Dask-CUDA workers (requires 'distributed' cluster).",
     )
     parser.add_argument(
         "--blocksize",
@@ -627,11 +782,12 @@ def parse_args(
     )
     parser.add_argument(
         "--rmm-pool-size",
-        default=0.5,
+        default=None,
         type=float,
         help=textwrap.dedent("""\
             Fraction of total GPU memory to allocate for RMM pool.
-            Default: 0.5 (50%% of GPU memory)"""),
+            Default: 0.5 (50%% of GPU memory) when --no-rmm-async,
+            None when --rmm-async"""),
     )
     parser.add_argument(
         "--rmm-release-threshold",
@@ -646,7 +802,7 @@ def parse_args(
         "--rmm-async",
         action=argparse.BooleanOptionalAction,
         default=False,
-        help="Use RMM async memory resource.",
+        help="Use RMM async memory resource. Note: only affects distributed cluster!",
     )
     parser.add_argument(
         "--rapidsmpf-oom-protection",
@@ -721,13 +877,40 @@ def parse_args(
         default="duckdb",
         help="Which engine to use as the baseline for validation.",
     )
+
+    parser.add_argument(
+        "--collect-traces",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Collect data tracing cudf-polars execution.",
+    )
+
     parser.add_argument(
         "--stats-planning",
         action=argparse.BooleanOptionalAction,
         default=False,
         help="Enable statistics planning.",
     )
-    return parser.parse_args(args)
+    parser.add_argument(
+        "--max-io-threads",
+        default=2,
+        type=int,
+        help="Maximum number of IO threads for rapidsmpf runtime.",
+    )
+    parser.add_argument(
+        "--native-parquet",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Use C++ read_parquet nodes for the rapidsmpf runtime.",
+    )
+
+    parsed_args = parser.parse_args(args)
+
+    if parsed_args.rmm_pool_size is None and not parsed_args.rmm_async:
+        # The default rmm pool size depends on the rmm_async flag
+        parsed_args.rmm_pool_size = 0.5
+
+    return parsed_args
 
 
 def run_polars(
@@ -742,17 +925,28 @@ def run_polars(
     validation_failures: list[int] = []
     query_failures: list[tuple[int, int]] = []
 
-    client = initialize_dask_cluster(run_config, args)  # type: ignore
+    client = initialize_dask_cluster(run_config, args)
 
     records: defaultdict[int, list[Record]] = defaultdict(list)
     engine: pl.GPUEngine | None = None
 
     if run_config.executor != "cpu":
         executor_options = get_executor_options(run_config, benchmark=benchmark)
+        if run_config.runtime == "rapidsmpf":
+            parquet_options = {
+                "use_rapidsmpf_native": run_config.native_parquet,
+            }
+        else:
+            parquet_options = {}
         engine = pl.GPUEngine(
             raise_on_fail=True,
+            memory_resource=rmm.mr.CudaAsyncMemoryResource()
+            if run_config.rmm_async
+            else None,
+            cuda_stream_policy=run_config.stream_policy,
             executor=run_config.executor,
             executor_options=executor_options,
+            parquet_options=parquet_options,
         )
 
     for q_id in run_config.queries:
@@ -764,8 +958,12 @@ def run_polars(
         print_query_plan(q_id, q, args, run_config, engine)
 
         records[q_id] = []
-
         for i in range(args.iterations):
+            if _HAS_STRUCTLOG and run_config.collect_traces:
+                setup_logging(q_id, i)
+                if client is not None:
+                    client.run(setup_logging, q_id, i)
+
             t0 = time.monotonic()
 
             try:
@@ -781,8 +979,8 @@
                         gather_shuffle_statistics,
                     )
 
-                    shuffle_stats = gather_shuffle_statistics(client)  # type: ignore[arg-type]
-                    clear_shuffle_statistics(client)  # type: ignore[arg-type]
+                    shuffle_stats = gather_shuffle_statistics(client)
+                    clear_shuffle_statistics(client)
                 else:
                     shuffle_stats = None
 
@@ -800,15 +998,65 @@ def run_polars(
                 print(f"❌ Query {q_id} failed validation!\n{e}")
 
             t1 = time.monotonic()
-            record = Record(query=q_id, duration=t1 - t0, shuffle_stats=shuffle_stats)
+            record = Record(
+                query=q_id, iteration=i, duration=t1 - t0, shuffle_stats=shuffle_stats
+            )
             if args.print_results:
                 print(result)
 
-            print(f"Query {q_id} - Iteration {i} finished in {record.duration:0.4f}s")
+            print(
+                f"Query {q_id} - Iteration {i} finished in {record.duration:0.4f}s",
+                flush=True,
+            )
             records[q_id].append(record)
 
     run_config = dataclasses.replace(run_config, records=dict(records))
 
+    # consolidate logs
+    if _HAS_STRUCTLOG and run_config.collect_traces:
+
+        def gather_logs() -> str:
+            logger = logging.getLogger()
+            return logger.handlers[0].stream.getvalue()  # type: ignore[attr-defined]
+
+        if client is not None:
+            all_logs = "\n".join(client.run(gather_logs).values())
+        else:
+            all_logs = gather_logs()
+
+        parsed_logs = [json.loads(log) for log in all_logs.splitlines() if log]
+        # Some other log records can end up in here. Filter those out.
+        parsed_logs = [log for log in parsed_logs if log["event"] == "Execute IR"]
+        # Now we want to augment the existing Records with the trace data.
+
+        def group_key(x: dict) -> int:
+            return x["query_id"]
+
+        def sort_key(x: dict) -> tuple[int, int]:
+            return x["query_id"], x["iteration"]
+
+        grouped = itertools.groupby(
+            sorted(parsed_logs, key=sort_key),
+            key=group_key,
+        )
+
+        for query_id, run_logs_group in grouped:
+            run_logs = list(run_logs_group)
+            by_iteration = [
+                list(x)
+                for _, x in itertools.groupby(run_logs, key=lambda x: x["iteration"])
+            ]
+            run_records = run_config.records[query_id]
+            assert len(by_iteration) == len(run_records)  # same number of iterations
+            all_traces = [list(iteration) for iteration in by_iteration]
+
+            new_records = [
+                dataclasses.replace(record, traces=traces)
+                for record, traces in zip(run_records, all_traces, strict=True)
+            ]
+
+            run_config.records[query_id] = new_records
+
     if args.summarize:
         run_config.summarize()
 
@@ -830,3 +1078,312 @@ def run_polars(
 
     if query_failures or validation_failures:
         sys.exit(1)
+
+
+def setup_logging(query_id: int, iteration: int) -> None:  # noqa: D103
+    import cudf_polars.dsl.tracing
+
+    if not cudf_polars.dsl.tracing.LOG_TRACES:
+        msg = (
+            "Tracing requested via --collect-traces, but tracking is not enabled. "
+            "Verify that 'CUDF_POLARS_LOG_TRACES' is set and structlog is installed."
+        )
+        raise RuntimeError(msg)
+
+    if _HAS_STRUCTLOG:
+        # structlog uses contextvars to propagate context down to where log records
+        # are emitted. Ideally, we'd just set the contextvars here using
+        # structlog.bind_contextvars; for the distributed cluster we would need
+        # to use something like client.run to set the contextvars on the worker.
+        # However, there's an unfortunate conflict between structlog's use of
+        # context vars and how Dask Workers actually execute tasks, such that
+        # the contextvars set via `client.run` aren't visible to the actual
+        # tasks.
+        #
+        # So instead we make a new logger each time we need a new context,
+        # i.e. for each query/iteration pair.
+
+        def make_injector(
+            query_id: int, iteration: int
+        ) -> Callable[[logging.Logger, str, dict[str, Any]], dict[str, Any]]:
+            def inject(
+                logger: Any, method_name: Any, event_dict: Any
+            ) -> dict[str, Any]:
+                event_dict["query_id"] = query_id
+                event_dict["iteration"] = iteration
+                return event_dict
+
+            return inject
+
+        shared_processors = [
+            structlog.contextvars.merge_contextvars,
+            make_injector(query_id, iteration),
+            structlog.processors.add_log_level,
+            structlog.processors.CallsiteParameterAdder(
+                parameters=[
+                    structlog.processors.CallsiteParameter.PROCESS,
+                    structlog.processors.CallsiteParameter.THREAD,
+                ],
+            ),
+            structlog.processors.StackInfoRenderer(),
+            structlog.dev.set_exc_info,
+            structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S.%f", utc=False),
+        ]
+
+        # For logging to a file
+        json_renderer = structlog.processors.JSONRenderer()
+
+        stream = io.StringIO()
+        json_file_handler = logging.StreamHandler(stream)
+        json_file_handler.setFormatter(
+            structlog.stdlib.ProcessorFormatter(
+                processor=json_renderer,
+                foreign_pre_chain=shared_processors,
+            )
+        )
+
+        logging.basicConfig(level=logging.INFO, handlers=[json_file_handler])
+
+        structlog.configure(
+            processors=[
+                *shared_processors,
+                structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
+            ],
+            logger_factory=structlog.stdlib.LoggerFactory(),
+            wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
+            cache_logger_on_first_use=True,
+        )
+
+
+PDSDS_TABLE_NAMES: list[str] = [
+    "call_center",
+    "catalog_page",
+    "catalog_returns",
+    "catalog_sales",
+    "customer",
+    "customer_address",
+    "customer_demographics",
+    "date_dim",
+    "household_demographics",
+    "income_band",
+    "inventory",
+    "item",
+    "promotion",
+    "reason",
+    "ship_mode",
+    "store",
+    "store_returns",
+    "store_sales",
+    "time_dim",
+    "warehouse",
+    "web_page",
+    "web_returns",
+    "web_sales",
+    "web_site",
+]
+
+PDSH_TABLE_NAMES: list[str] = [
+    "customer",
+    "lineitem",
+    "nation",
+    "orders",
+    "part",
+    "partsupp",
+    "region",
+    "supplier",
+]
+
+
+def print_duckdb_plan(
+    q_id: int,
+    sql: str,
+    dataset_path: Path,
+    suffix: str,
+    query_set: str,
+    args: argparse.Namespace,
+) -> None:
+    """Print DuckDB query plan using EXPLAIN."""
+    if duckdb is None:
+        raise ImportError(duckdb_err)
+
+    if query_set == "pdsds":
+        tbl_names = PDSDS_TABLE_NAMES
+    else:
+        tbl_names = PDSH_TABLE_NAMES
+
+    with duckdb.connect() as conn:
+        for name in tbl_names:
+            pattern = (Path(dataset_path) / name).as_posix() + suffix
+            conn.execute(
+                f"CREATE OR REPLACE VIEW {name} AS "
+                f"SELECT * FROM parquet_scan('{pattern}');"
+            )
+
+        if args.explain_logical and args.explain:
+            conn.execute("PRAGMA explain_output = 'all';")
+        elif args.explain_logical:
+            conn.execute("PRAGMA explain_output = 'optimized_only';")
+        else:
+            conn.execute("PRAGMA explain_output = 'physical_only';")
+
+        print(f"\nDuckDB Query {q_id} - Plan\n")
+
+        plan_rows = conn.execute(f"EXPLAIN {sql}").fetchall()
+        for _, line in plan_rows:
+            print(line)
+
+
+def execute_duckdb_query(
+    query: str,
+    dataset_path: Path,
+    *,
+    suffix: str = ".parquet",
+    query_set: str = "pdsh",
+) -> pl.DataFrame:
+    """Execute a query with DuckDB."""
+    if duckdb is None:
+        raise ImportError(duckdb_err)
+    if query_set == "pdsds":
+        tbl_names = PDSDS_TABLE_NAMES
+    else:
+        tbl_names = PDSH_TABLE_NAMES
+    with duckdb.connect() as conn:
+        for name in tbl_names:
+            pattern = (Path(dataset_path) / name).as_posix() + suffix
+            conn.execute(
+                f"CREATE OR REPLACE VIEW {name} AS "
+                f"SELECT * FROM parquet_scan('{pattern}');"
+            )
+        return conn.execute(query).pl()
+
+
+def run_duckdb(
+    duckdb_queries_cls: Any, options: Sequence[str] | None = None, *, num_queries: int
+) -> None:
+    """Run the benchmark with DuckDB."""
+    args = parse_args(options, num_queries=num_queries)
+    vars(args).update({"query_set": duckdb_queries_cls.name})
+    run_config = RunConfig.from_args(args)
+    records: defaultdict[int, list[Record]] = defaultdict(list)
+
+    for q_id in run_config.queries:
+        try:
+            get_q = getattr(duckdb_queries_cls, f"q{q_id}")
+        except AttributeError as err:
+            raise NotImplementedError(f"Query {q_id} not implemented.") from err
+
+        sql = get_q(run_config)
+
+        if args.explain or args.explain_logical:
+            print_duckdb_plan(
+                q_id=q_id,
+                sql=sql,
+                dataset_path=run_config.dataset_path,
+                suffix=run_config.suffix,
+                query_set=duckdb_queries_cls.name,
+                args=args,
+            )
+
+        print(f"DuckDB Executing: {q_id}")
+        records[q_id] = []
+
+        for i in range(args.iterations):
+            t0 = time.time()
+            result = execute_duckdb_query(
+                sql,
+                run_config.dataset_path,
+                suffix=run_config.suffix,
+                query_set=duckdb_queries_cls.name,
+            )
+            t1 = time.time()
+            record = Record(query=q_id, iteration=i, duration=t1 - t0)
+            if args.print_results:
+                print(result)
+            print(f"Query {q_id} - Iteration {i} finished in {record.duration:0.4f}s")
+            records[q_id].append(record)
+
+    run_config = dataclasses.replace(run_config, records=dict(records))
+    if args.summarize:
+        run_config.summarize()
+
+
+def run_validate(
+    polars_queries_cls: Any,
+    duckdb_queries_cls: Any,
+    options: Sequence[str] | None = None,
+    *,
+    num_queries: int,
+    check_dtypes: bool,
+    check_column_order: bool,
+) -> None:
+    """Validate Polars CPU/GPU vs DuckDB."""
+    from polars.testing import assert_frame_equal
+
+    args = parse_args(options, num_queries=num_queries)
+    vars(args).update({"query_set": polars_queries_cls.name})
+    run_config = RunConfig.from_args(args)
+
+    baseline = args.baseline
+    if baseline not in {"duckdb", "cpu"}:
+        raise ValueError("Baseline must be one of: 'duckdb', 'cpu'")
+
+    failures: list[int] = []
+
+    engine: pl.GPUEngine | None = None
+    if run_config.executor != "cpu":
+        engine = pl.GPUEngine(
+            raise_on_fail=True,
+            executor=run_config.executor,
+            executor_options=get_executor_options(run_config, polars_queries_cls),
+        )
+
+    for q_id in run_config.queries:
+        print(f"\nValidating Query {q_id}")
+        try:
+            get_pl = getattr(polars_queries_cls, f"q{q_id}")
+            get_ddb = getattr(duckdb_queries_cls, f"q{q_id}")
+        except AttributeError as err:
+            raise NotImplementedError(f"Query {q_id} not implemented.") from err
+
+        polars_query = get_pl(run_config)
+        if baseline == "duckdb":
+            base_sql = get_ddb(run_config)
+            base_result = execute_duckdb_query(
+                base_sql,
+                run_config.dataset_path,
+                query_set=duckdb_queries_cls.name,
+            )
+        else:
+            base_result = polars_query.collect(engine="streaming")
+
+        if run_config.executor == "cpu":
+            test_result = polars_query.collect(engine="streaming")
+        else:
+            try:
+                test_result = polars_query.collect(engine=engine)
+            except Exception as e:
+                failures.append(q_id)
+                print(f"❌ Query {q_id} failed validation: GPU execution failed.\n{e}")
+                continue
+
+        try:
+            assert_frame_equal(
+                base_result,
+                test_result,
+                check_dtypes=check_dtypes,
+                check_column_order=check_column_order,
+            )
+            print(f"✅ Query {q_id} passed validation.")
+        except AssertionError as e:
+            failures.append(q_id)
+            print(f"❌ Query {q_id} failed validation:\n{e}")
+            if args.print_results:
+                print("Baseline Result:\n", base_result)
+                print("Test Result:\n", test_result)
+
+    if failures:
+        print("\nValidation Summary:")
+        print("===================")
+        print(f"{len(failures)} query(s) failed: {failures}")
+    else:
+        print("\nAll queries passed validation.")