PyPI - statgpu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

statgpu/__init__.py +174 -0
statgpu/_base.py +544 -0
statgpu/_config.py +127 -0
statgpu/anova/__init__.py +5 -0
statgpu/anova/_oneway.py +194 -0
statgpu/backends/__init__.py +83 -0
statgpu/backends/_array_ops.py +529 -0
statgpu/backends/_base.py +184 -0
statgpu/backends/_cupy.py +453 -0
statgpu/backends/_factory.py +65 -0
statgpu/backends/_gpu_inference_cupy.py +214 -0
statgpu/backends/_gpu_inference_torch.py +422 -0
statgpu/backends/_numpy.py +324 -0
statgpu/backends/_torch.py +685 -0
statgpu/backends/_torch_safe.py +47 -0
statgpu/backends/_utils.py +423 -0
statgpu/core/__init__.py +10 -0
statgpu/core/formula/__init__.py +33 -0
statgpu/core/formula/_design.py +99 -0
statgpu/core/formula/_parser.py +191 -0
statgpu/core/formula/_terms.py +70 -0
statgpu/core/formula/tests/__init__.py +0 -0
statgpu/core/formula/tests/test_parser.py +194 -0
statgpu/covariance/__init__.py +6 -0
statgpu/covariance/_empirical.py +310 -0
statgpu/covariance/_shrinkage.py +248 -0
statgpu/cross_validation/__init__.py +31 -0
statgpu/cross_validation/_base.py +410 -0
statgpu/cross_validation/_engine.py +167 -0
statgpu/diagnostics/__init__.py +7 -0
statgpu/diagnostics/_regression_diagnostics.py +188 -0
statgpu/feature_selection/__init__.py +24 -0
statgpu/feature_selection/_knockoff.py +870 -0
statgpu/feature_selection/_knockoff_utils.py +1003 -0
statgpu/feature_selection/_stepwise.py +300 -0
statgpu/glm_core/__init__.py +81 -0
statgpu/glm_core/_base.py +202 -0
statgpu/glm_core/_family.py +362 -0
statgpu/glm_core/_fused.py +149 -0
statgpu/glm_core/_gamma.py +111 -0
statgpu/glm_core/_inverse_gaussian.py +62 -0
statgpu/glm_core/_irls.py +561 -0
statgpu/glm_core/_logistic.py +82 -0
statgpu/glm_core/_negative_binomial.py +68 -0
statgpu/glm_core/_poisson.py +60 -0
statgpu/glm_core/_solver_legacy.py +100 -0
statgpu/glm_core/_squared.py +53 -0
statgpu/glm_core/_tweedie.py +74 -0
statgpu/inference/__init__.py +239 -0
statgpu/inference/_distributions_backend.py +2610 -0
statgpu/inference/_multiple_testing.py +391 -0
statgpu/inference/_resampling.py +1400 -0
statgpu/inference/_results.py +265 -0
statgpu/linear_model/__init__.py +75 -0
statgpu/linear_model/_gaussian_inference.py +306 -0
statgpu/linear_model/_glm_base.py +1261 -0
statgpu/linear_model/_ordered_logit.py +52 -0
statgpu/linear_model/_ordered_probit.py +50 -0
statgpu/linear_model/_stats.py +170 -0
statgpu/linear_model/cv/__init__.py +13 -0
statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
statgpu/linear_model/cv/_lasso_cv.py +253 -0
statgpu/linear_model/cv/_logistic_cv.py +895 -0
statgpu/linear_model/cv/_ridge_cv.py +1160 -0
statgpu/linear_model/legacy/__init__.py +1 -0
statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
statgpu/linear_model/legacy/_solver_legacy.py +104 -0
statgpu/linear_model/penalized/__init__.py +25 -0
statgpu/linear_model/penalized/_base.py +437 -0
statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
statgpu/linear_model/penalized/_penalized_linear.py +236 -0
statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
statgpu/linear_model/penalized/_predict_mixin.py +182 -0
statgpu/linear_model/wrappers/__init__.py +31 -0
statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
statgpu/linear_model/wrappers/_elasticnet.py +75 -0
statgpu/linear_model/wrappers/_gamma.py +67 -0
statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
statgpu/linear_model/wrappers/_lasso.py +2124 -0
statgpu/linear_model/wrappers/_linear.py +1127 -0
statgpu/linear_model/wrappers/_logistic.py +1435 -0
statgpu/linear_model/wrappers/_mcp.py +58 -0
statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
statgpu/linear_model/wrappers/_poisson.py +48 -0
statgpu/linear_model/wrappers/_ridge.py +166 -0
statgpu/linear_model/wrappers/_scad.py +58 -0
statgpu/linear_model/wrappers/_tweedie.py +57 -0
statgpu/metrics/__init__.py +21 -0
statgpu/metrics/_classification.py +591 -0
statgpu/nonparametric/__init__.py +50 -0
statgpu/nonparametric/kernel_methods/__init__.py +25 -0
statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
statgpu/nonparametric/kernel_methods/_krr.py +234 -0
statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
statgpu/nonparametric/splines/__init__.py +5 -0
statgpu/nonparametric/splines/_bspline_basis.py +336 -0
statgpu/nonparametric/splines/_penalized.py +349 -0
statgpu/panel/__init__.py +19 -0
statgpu/panel/_covariance.py +140 -0
statgpu/panel/_fixed_effects.py +420 -0
statgpu/panel/_random_effects.py +385 -0
statgpu/panel/_utils.py +482 -0
statgpu/penalties/__init__.py +139 -0
statgpu/penalties/_adaptive_l1.py +313 -0
statgpu/penalties/_base.py +261 -0
statgpu/penalties/_categories.py +39 -0
statgpu/penalties/_elasticnet.py +98 -0
statgpu/penalties/_group_lasso.py +678 -0
statgpu/penalties/_group_mcp.py +553 -0
statgpu/penalties/_group_scad.py +605 -0
statgpu/penalties/_l1.py +107 -0
statgpu/penalties/_l2.py +77 -0
statgpu/penalties/_mcp.py +237 -0
statgpu/penalties/_scad.py +260 -0
statgpu/semiparametric/__init__.py +5 -0
statgpu/semiparametric/_gam.py +401 -0
statgpu/solvers/__init__.py +24 -0
statgpu/solvers/_admm.py +241 -0
statgpu/solvers/_constants.py +15 -0
statgpu/solvers/_convergence.py +6 -0
statgpu/solvers/_fista.py +436 -0
statgpu/solvers/_fista_bb.py +513 -0
statgpu/solvers/_fista_lla.py +541 -0
statgpu/solvers/_lbfgs.py +206 -0
statgpu/solvers/_newton.py +149 -0
statgpu/solvers/_utils.py +277 -0
statgpu/survival/__init__.py +14 -0
statgpu/survival/_cox.py +3974 -0
statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
statgpu/survival/_cox_cv.py +1159 -0
statgpu/survival/_cox_efron_cuda.py +1280 -0
statgpu/survival/_cox_efron_triton.py +359 -0
statgpu/unsupervised/__init__.py +29 -0
statgpu/unsupervised/_agglomerative.py +307 -0
statgpu/unsupervised/_dbscan.py +263 -0
statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
statgpu/unsupervised/_gmm.py +332 -0
statgpu/unsupervised/_incremental_pca.py +176 -0
statgpu/unsupervised/_kmeans.py +261 -0
statgpu/unsupervised/_minibatch_kmeans.py +299 -0
statgpu/unsupervised/_minibatch_nmf.py +252 -0
statgpu/unsupervised/_nmf.py +190 -0
statgpu/unsupervised/_pca.py +189 -0
statgpu/unsupervised/_truncated_svd.py +132 -0
statgpu/unsupervised/_tsne.py +192 -0
statgpu/unsupervised/_umap.py +224 -0
statgpu/unsupervised/_utils.py +134 -0
statgpu-0.1.0.dist-info/METADATA +245 -0
statgpu-0.1.0.dist-info/RECORD +168 -0
statgpu-0.1.0.dist-info/WHEEL +5 -0
statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
statgpu-0.1.0.dist-info/top_level.txt +1 -0

statgpu/inference/_resampling.py ADDED Viewed

@@ -0,0 +1,1400 @@
+"""Unified resampling engine for bootstrap and permutation testing."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, Optional, Sequence, Tuple
+import numpy as np
+from statgpu.backends import get_backend, _resolve_backend, _to_float_scalar, _to_numpy, _torch_dev, xp_empty
+import operator
+from functools import reduce
+def _count_elts(arr):
+    """Return total number of elements (works across numpy, cupy, torch)."""
+    return reduce(operator.mul, arr.shape, 1)
+def _coerce_sample_value(x, backend):
+    """Convert statistic output to a scalar array value for samples without forcing host sync."""
+    try:
+        x_arr = backend.asarray(x)
+    except Exception:
+        return float(x)
+    if x_arr.ndim != 0:
+        raise ValueError("statistic must return a scalar value")
+    return backend.astype(x_arr, backend.float64)
+def _coerce_vectorized_values(values, expected_size: int, backend):
+    """Normalize vectorized statistic output to a 1D float64 array or return None."""
+    try:
+        arr = backend.asarray(values)
+    except Exception:
+        return None
+    if arr.ndim == 0:
+        return None
+    if arr.ndim != 1:
+        if _count_elts(arr) != int(expected_size):
+            return None
+        arr = arr.reshape(-1)
+    if int(arr.shape[0]) != int(expected_size):
+        return None
+    return backend.astype(arr, backend.float64)
+def _try_vectorized_statistic(statistic, expected_size: int, backend, *args):
+    """Try vectorized statistic call and return normalized output when compatible."""
+    try:
+        out = statistic(*args)
+    except Exception:
+        return None
+    return _coerce_vectorized_values(out, expected_size, backend)
+def _validate_fastpath_hint(statistic_hint: Optional[str]) -> Optional[str]:
+    if statistic_hint is None:
+        return None
+    hint = str(statistic_hint).strip().lower()
+    if hint in ("", "none"):
+        return None
+    allowed = {"mean", "pearson_corr"}
+    if hint not in allowed:
+        raise ValueError("statistic_hint must be one of: None, 'mean', 'pearson_corr'")
+    return hint
+def _mean_batch_stat(samples_batch, backend):
+    return backend.xp.mean(samples_batch, axis=-1, dtype=backend.float64)
+def _select_single_feature_vector(X, backend):
+    X_arr = backend.asarray(X)
+    if X_arr.ndim == 1:
+        return backend.astype(X_arr, backend.float64)
+    if X_arr.ndim == 2 and int(X_arr.shape[1]) == 1:
+        return backend.astype(X_arr[:, 0], backend.float64)
+    raise ValueError("statistic_hint='pearson_corr' requires X with shape (n,) or (n, 1)")
+def _pearson_corr_with_y_batch(x_vec, y_batch, backend):
+    x = backend.asarray(x_vec, dtype=backend.float64).reshape(-1)
+    y = backend.asarray(y_batch, dtype=backend.float64)
+    x_centered = x - backend.xp.mean(x)
+    x_norm_sq = backend.xp.sum(x_centered * x_centered)
+    if y.ndim == 1:
+        y_centered = y - backend.xp.mean(y)
+        denom = backend.xp.sqrt(x_norm_sq * backend.xp.sum(y_centered * y_centered))
+        denom_safe = backend.xp.where(denom > 0.0, denom, backend.xp.inf)
+        numer = backend.xp.sum(y_centered * x_centered)
+        return numer / denom_safe
+    if y.ndim != 2:
+        raise ValueError("y must be 1D or 2D batch matrix for pearson_corr fastpath")
+    y_centered = y - backend.xp.mean(y, axis=1, keepdims=True)
+    y_norm_sq = backend.xp.sum(y_centered * y_centered, axis=1)
+    denom = backend.xp.sqrt(x_norm_sq * y_norm_sq)
+    denom_safe = backend.xp.where(denom > 0.0, denom, backend.xp.inf)
+    numer = backend.xp.sum(y_centered * x_centered.reshape(1, -1), axis=1)
+    return numer / denom_safe
+def _rng_default(backend_name: str, random_state: Optional[int], device: str = "cuda"):
+    if backend_name == "numpy":
+        return np.random.default_rng(random_state)
+    if backend_name == "torch":
+        import torch
+        g = torch.Generator(device=device)
+        if random_state is not None:
+            g.manual_seed(int(random_state))
+        return g
+    import cupy as cp
+    seed = 0 if random_state is None else int(random_state)
+    return cp.random.RandomState(seed)
+def _rng_integers(rng, low: int, high: int, size, backend_name: str, device: str = "cuda"):
+    if backend_name == "numpy":
+        return rng.integers(low, high, size=size, dtype=np.int64)
+    if backend_name == "torch":
+        import torch
+        return torch.randint(low, high, size, generator=rng, dtype=torch.int64, device=device)
+    if hasattr(rng, "integers"):
+        try:
+            return rng.integers(low, high, size=size, dtype=np.int64)
+        except TypeError:
+            return rng.integers(low, high, size=size)
+    return rng.randint(low, high, size=size, dtype="int64")
+def _rng_permutation(rng, n: int, backend_name: str, device: str = "cuda"):
+    if backend_name == "numpy":
+        return rng.permutation(n)
+    if backend_name == "torch":
+        import torch
+        return torch.randperm(n, generator=rng, dtype=torch.int64, device=device)
+    return rng.permutation(n)
+def _rng_random(rng, size, backend_name: str, dtype=None, device: str = "cuda"):
+    if backend_name == "numpy":
+        if dtype is None:
+            return rng.random(size=size)
+        return rng.random(size=size, dtype=dtype)
+    if backend_name == "torch":
+        import torch
+        if dtype is None:
+            dtype = torch.float64
+        elif not isinstance(dtype, torch.dtype):
+            dtype = torch.from_numpy(np.empty(0, dtype=dtype)).dtype
+        return torch.rand(size, generator=rng, dtype=dtype, device=device)
+    if hasattr(rng, "random"):
+        if dtype is None:
+            return rng.random(size=size)
+        try:
+            return rng.random(size=size, dtype=dtype)
+        except TypeError:
+            out = rng.random(size=size)
+            if hasattr(out, "astype"):
+                return out.astype(dtype, copy=False)
+            return out
+    out = rng.random_sample(size)
+    if dtype is not None and hasattr(out, "astype"):
+        return out.astype(dtype, copy=False)
+    return out
+def _cupy_index_dtype_name(n: int) -> str:
+    return "int32" if int(n) <= np.iinfo(np.int32).max else "int64"
+def _recommend_cupy_batch_size(
+    n: int,
+    n_resamples: int,
+    *,
+    bytes_per_row: int,
+    target_bytes: int,
+    min_batch: int,
+    max_batch: int,
+) -> int:
+    if n <= 0:
+        return 1
+    by_memory = max(1, target_bytes // max(1, bytes_per_row * n))
+    batch = min(max_batch, max(min_batch, by_memory))
+    return max(1, min(batch, int(n_resamples)))
+def _iter_iid_bootstrap_index_batches(rng, n: int, n_resamples: int, backend_name: str, device: str = "cuda"):
+    if backend_name == "numpy":
+        batch_size = _recommend_cupy_batch_size(
+            n, n_resamples, bytes_per_row=8,
+            target_bytes=32 * 1024 * 1024, min_batch=8, max_batch=1024,
+        )
+        for start in range(0, n_resamples, batch_size):
+            cur = min(batch_size, n_resamples - start)
+            idx_batch = _rng_integers(rng, 0, n, size=(cur, n), backend_name=backend_name, device=device)
+            yield idx_batch
+        return
+    if backend_name == "torch":
+        batch_size = _recommend_cupy_batch_size(
+            n, n_resamples, bytes_per_row=8,
+            target_bytes=64 * 1024 * 1024, min_batch=32, max_batch=2048,
+        )
+        for start in range(0, n_resamples, batch_size):
+            cur = min(batch_size, n_resamples - start)
+            idx_batch = _rng_integers(rng, 0, n, size=(cur, n), backend_name=backend_name, device=device)
+            yield idx_batch
+        return
+    # CuPy path: int64 index matrix; keep around ~64MB to balance throughput and memory.
+    batch_size = _recommend_cupy_batch_size(
+        n, n_resamples, bytes_per_row=8,
+        target_bytes=64 * 1024 * 1024, min_batch=32, max_batch=2048,
+    )
+    index_dtype = _cupy_index_dtype_name(n)
+    for start in range(0, n_resamples, batch_size):
+        cur = min(batch_size, n_resamples - start)
+        if hasattr(rng, "integers"):
+            try:
+                idx_batch = rng.integers(0, n, size=(cur, n), dtype=index_dtype)
+            except TypeError:
+                idx_batch = rng.integers(0, n, size=(cur, n))
+        else:
+            idx_batch = rng.randint(0, n, size=(cur, n), dtype=index_dtype)
+        yield idx_batch
+def _iter_iid_permutation_batches(rng, n: int, n_resamples: int, backend_name: str, device: str = "cuda"):
+    if backend_name == "numpy":
+        batch_size = _recommend_cupy_batch_size(
+            n, n_resamples, bytes_per_row=12,
+            target_bytes=24 * 1024 * 1024, min_batch=4, max_batch=256,
+        )
+        for start in range(0, n_resamples, batch_size):
+            cur = min(batch_size, n_resamples - start)
+            keys = _rng_random(rng, (cur, n), backend_name, dtype=np.float32, device=device)
+            perm_batch = np.argsort(keys, axis=1)
+            yield perm_batch
+        return
+    if backend_name == "torch":
+        import torch
+        batch_size = _recommend_cupy_batch_size(
+            n, n_resamples, bytes_per_row=12,
+            target_bytes=48 * 1024 * 1024, min_batch=16, max_batch=2048,
+        )
+        for start in range(0, n_resamples, batch_size):
+            cur = min(batch_size, n_resamples - start)
+            keys = _rng_random(rng, (cur, n), backend_name, dtype=torch.float32, device=device)
+            perm_batch = torch.argsort(keys, dim=1)
+            yield perm_batch
+        return
+    # CuPy path: approx memory per row: float32 random keys + int64 permutation indices.
+    batch_size = _recommend_cupy_batch_size(
+        n, n_resamples, bytes_per_row=12,
+        target_bytes=48 * 1024 * 1024, min_batch=16, max_batch=2048,
+    )
+    import cupy as cp
+    for start in range(0, n_resamples, batch_size):
+        cur = min(batch_size, n_resamples - start)
+        keys = _rng_random(rng, (cur, n), backend_name, dtype=cp.float32)
+        perm_batch = cp.argsort(keys, axis=1)
+        yield perm_batch
+def _iter_stratified_bootstrap_index_batches(
+    rng,
+    state,
+    n_resamples: int,
+    backend_name: str,
+    device: str = "cuda",
+    *,
+    shuffle_rows: bool = True,
+):
+    """Yield batches of stratified bootstrap row indices, one resample per row.
+
+    Every stratum is resampled with replacement from its own rows, drawing as
+    many indices as the stratum has members.
+
+    Parameters
+    ----------
+    rng
+        Generator passed on to ``_rng_integers`` and ``_rng_random``.
+    state
+        Mapping with ``"strata_rows"`` (per-stratum row-index arrays) and
+        ``"n_samples"``. When both ``"strata_rows_matrix"`` and
+        ``"strata_uniform_size"`` are present and not ``None``, the dense
+        single-draw path is used.
+    n_resamples
+        Total number of resamples to generate across all batches.
+    backend_name
+        ``"numpy"``, ``"torch"``, or anything else for the CuPy settings.
+    device
+        Forwarded to the random helpers.
+    shuffle_rows
+        When true, the columns of each resample are independently permuted.
+    """
+    backend = get_backend(backend_name)
+    strata_rows = state["strata_rows"]
+    strata_rows_matrix = state.get("strata_rows_matrix")
+    strata_uniform_size = state.get("strata_uniform_size")
+    n = int(state["n_samples"])
+    # Per-backend memory target, batch-size clamps, and dtype of the shuffle keys.
+    if backend_name == "numpy":
+        target = 24 * 1024 * 1024
+        min_batch = 4
+        max_batch = 512
+        key_dtype = np.float32
+    elif backend_name == "torch":
+        import torch
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+        key_dtype = torch.float32
+    else:
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+        import cupy as cp
+        key_dtype = cp.float32
+    # 8 bytes per int64 index plus 4 bytes per float32 key when shuffling.
+    # NOTE(review): this value is already multiplied by n, and
+    # _recommend_cupy_batch_size multiplies bytes_per_row by its first argument
+    # (n) again, so the estimate grows with n * n - confirm this is intended.
+    bytes_per_row = 8 * n + (4 * n if shuffle_rows else 0)
+    batch_size = _recommend_cupy_batch_size(
+        n, n_resamples, bytes_per_row=bytes_per_row,
+        target_bytes=target, min_batch=min_batch, max_batch=max_batch,
+    )
+    for start in range(0, n_resamples, batch_size):
+        # The final batch may be shorter than batch_size.
+        cur = min(batch_size, n_resamples - start)
+        if strata_rows_matrix is not None and strata_uniform_size is not None:
+            # Uniform strata: draw local positions for all strata in one call,
+            # then gather the global row indices from the row matrix.
+            n_strata = int(strata_rows_matrix.shape[0])
+            m = int(strata_uniform_size)
+            sampled_local = _rng_integers(
+                rng, 0, m, size=(cur, n_strata, m), backend_name=backend_name, device=device,
+            )
+            strata_ids = backend.arange(n_strata, dtype=backend.int64).reshape(1, n_strata, 1)
+            idx_batch = strata_rows_matrix[strata_ids, sampled_local].reshape(cur, -1)
+        else:
+            # Ragged strata: fill the (cur, n) index matrix one stratum at a time.
+            # strata_rows[0] is handed to xp_empty - presumably as a device
+            # reference; confirm against statgpu.backends.
+            idx_batch = xp_empty((cur, n), backend.int64, backend.xp, strata_rows[0])
+            offset = 0
+            for pos in strata_rows:
+                m = int(_count_elts(pos))
+                sampled_local = _rng_integers(rng, 0, m, size=(cur, m), backend_name=backend_name, device=device)
+                idx_batch[:, offset : offset + m] = pos[sampled_local]
+                offset += m
+        if shuffle_rows:
+            # Argsorting random keys gives an independent permutation per row,
+            # so the resampled indices are no longer grouped by stratum.
+            keys = _rng_random(rng, (cur, n), backend_name, dtype=key_dtype, device=device)
+            perm = backend.xp.argsort(keys, axis=1)
+            idx_batch = backend.take_along_axis(idx_batch, perm, axis=1)
+        yield idx_batch
+def _iter_block_bootstrap_index_batches(
+    rng,
+    state,
+    n_resamples: int,
+    backend_name: str,
+    device: str = "cuda",
+):
+    backend = get_backend(backend_name)
+    n = int(state["n_samples"])
+    b = int(state["block_size"])
+    n_blocks = int(state["n_blocks"])
+    max_start = int(state["max_start"])
+    if backend_name == "numpy":
+        target = 24 * 1024 * 1024
+        min_batch = 4
+        max_batch = 512
+    elif backend_name == "torch":
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+    else:
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+    bytes_per_row = 8 * max(1, n)
+    batch_size = _recommend_cupy_batch_size(
+        max(1, n_blocks), n_resamples, bytes_per_row=bytes_per_row,
+        target_bytes=target, min_batch=min_batch, max_batch=max_batch,
+    )
+    offsets = backend.arange(b, dtype=backend.int64).reshape(1, 1, b)
+    for start in range(0, n_resamples, batch_size):
+        cur = min(batch_size, n_resamples - start)
+        starts = _rng_integers(rng, 0, max_start, size=(cur, n_blocks), backend_name=backend_name, device=device)
+        idx_batch = (starts[:, :, None] + offsets).reshape(cur, -1)
+        yield backend.astype(idx_batch[:, :n], backend.int64)
+def _iter_cluster_bootstrap_index_batches(
+    rng,
+    state,
+    n_resamples: int,
+    backend_name: str,
+    device: str = "cuda",
+):
+    """Batch cluster bootstrap index generation for uniform cluster sizes."""
+    backend = get_backend(backend_name)
+    n = int(state["n_samples"])
+    n_clusters = int(state["n_clusters"])
+    rows_matrix = state.get("cluster_rows_matrix")
+    uniform_size = state.get("cluster_uniform_size")
+    if rows_matrix is None or uniform_size is None:
+        raise ValueError("Batched cluster bootstrap requires uniform cluster sizes")
+    m = int(uniform_size)
+    draws = int(np.ceil(n / max(1, m)))
+    total_len = draws * m
+    if backend_name == "numpy":
+        target = 24 * 1024 * 1024
+        min_batch = 4
+        max_batch = 512
+    elif backend_name == "torch":
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+    else:
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+    batch_size = _recommend_cupy_batch_size(
+        max(1, total_len), n_resamples, bytes_per_row=8,
+        target_bytes=target, min_batch=min_batch, max_batch=max_batch,
+    )
+    for start in range(0, n_resamples, batch_size):
+        cur = min(batch_size, n_resamples - start)
+        cluster_ids = _rng_integers(rng, 0, n_clusters, size=(cur, draws), backend_name=backend_name, device=device)
+        idx_batch = rows_matrix[cluster_ids].reshape(cur, -1)
+        yield backend.astype(idx_batch[:, :n], backend.int64)
+def _iter_non_iid_bootstrap_index_batches(
+    rng,
+    state,
+    n_resamples: int,
+    backend_name: str,
+    device: str = "cuda",
+    *,
+    shuffle_rows: bool = True,
+):
+    strategy_n = state["strategy"]
+    if strategy_n == "stratified":
+        yield from _iter_stratified_bootstrap_index_batches(
+            rng,
+            state,
+            n_resamples,
+            backend_name,
+            device=device,
+            shuffle_rows=shuffle_rows,
+        )
+        return
+    if strategy_n == "block":
+        yield from _iter_block_bootstrap_index_batches(rng, state, n_resamples, backend_name, device=device)
+        return
+    if strategy_n == "cluster":
+        yield from _iter_cluster_bootstrap_index_batches(rng, state, n_resamples, backend_name, device=device)
+        return
+    raise ValueError("Batched non-IID bootstrap supports only 'stratified', 'cluster', and 'block'")
+def _iter_labelwise_permuted_y_batches(
+    rng,
+    y,
+    state,
+    n_resamples: int,
+    backend_name: str,
+    device: str = "cuda",
+):
+    """Yield batches of ``y`` whose values are permuted only within label groups.
+
+    Each yielded array has shape ``(cur, n)`` with ``n = state["n_samples"]``
+    and the dtype of ``y``; row ``i`` is one permuted copy of ``y``.
+
+    Parameters
+    ----------
+    rng
+        Generator passed on to ``_rng_random``.
+    y
+        Values to permute; converted with ``backend.asarray``.
+    state
+        Mapping with ``"n_samples"`` and ``"label_rows"`` (per-group position
+        arrays). The dense path is used only when ``"dense_label_rows"``,
+        ``"dense_valid_mask"``, ``"dense_valid_flat"``, ``"dense_pos_valid"``
+        and ``"label_sizes"`` are all present and not ``None``.
+    n_resamples
+        Total number of permuted copies to generate across all batches.
+    backend_name
+        ``"numpy"``, ``"torch"``, or anything else for the CuPy settings.
+    device
+        Forwarded to ``_rng_random``.
+    """
+    backend = get_backend(backend_name)
+    y_arr = backend.asarray(y)
+    n = int(state["n_samples"])
+    label_rows = state["label_rows"]
+    dense_label_rows = state.get("dense_label_rows")
+    dense_valid_mask = state.get("dense_valid_mask")
+    dense_valid_flat = state.get("dense_valid_flat")
+    dense_pos_valid = state.get("dense_pos_valid")
+    label_sizes = state.get("label_sizes")
+    # The dense path needs every precomputed dense structure to be available.
+    use_dense = (
+        dense_label_rows is not None
+        and dense_valid_mask is not None
+        and dense_valid_flat is not None
+        and dense_pos_valid is not None
+        and label_sizes is not None
+    )
+    # Per-backend memory target, batch-size clamps, and dtype of the sort keys.
+    if backend_name == "numpy":
+        target = 24 * 1024 * 1024
+        min_batch = 4
+        max_batch = 512
+        key_dtype = np.float32
+    elif backend_name == "torch":
+        import torch
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+        key_dtype = torch.float32
+    else:
+        target = 64 * 1024 * 1024
+        min_batch = 16
+        max_batch = 1024
+        import cupy as cp
+        key_dtype = cp.float32
+    # NOTE(review): both bytes_per_row values below are already scaled by the
+    # problem size and are multiplied by size_for_batch again inside
+    # _recommend_cupy_batch_size - confirm the resulting estimate is intended.
+    if use_dense:
+        n_labels = int(dense_label_rows.shape[0])
+        max_label_size = int(dense_label_rows.shape[1])
+        dense_elems = n_labels * max_label_size
+        bytes_per_row = max(8, y_arr.dtype.itemsize) * n + 12 * dense_elems
+        size_for_batch = max(1, dense_elems)
+    else:
+        bytes_per_row = max(8, y_arr.dtype.itemsize) * n + 4 * n
+        size_for_batch = n
+    batch_size = _recommend_cupy_batch_size(
+        size_for_batch,
+        n_resamples,
+        bytes_per_row=bytes_per_row,
+        target_bytes=target,
+        min_batch=min_batch,
+        max_batch=max_batch,
+    )
+    for start in range(0, n_resamples, batch_size):
+        # The final batch may be shorter than batch_size.
+        cur = min(batch_size, n_resamples - start)
+        y_batch = xp_empty((cur, n), y_arr.dtype, backend.xp, y_arr)
+        if use_dense:
+            # One random key per (resample, label, slot).
+            keys = _rng_random(
+                rng,
+                (cur, int(dense_label_rows.shape[0]), int(dense_label_rows.shape[1])),
+                backend_name,
+                dtype=key_dtype,
+                device=device,
+            )
+            # Slots where the mask is false get an infinite key, so argsort
+            # places them after every masked-in slot of the same label.
+            # Presumably these are padding slots; verify against the state builder.
+            keys = backend.xp.where(dense_valid_mask.reshape(1, *dense_valid_mask.shape), keys, backend.xp.inf)
+            perm_dense = backend.xp.argsort(keys, axis=2)
+            shuffled_dense = backend.take_along_axis(
+                dense_label_rows.reshape(1, *dense_label_rows.shape),
+                perm_dense,
+                axis=2,
+            )
+            # Flatten valid entries once and write all groups in one vectorized assignment.
+            shuffled_valid = shuffled_dense.reshape(cur, -1)[:, dense_valid_flat]
+            y_batch[:, dense_pos_valid] = y_arr[shuffled_valid]
+            yield y_batch
+            continue
+        # Fallback path: permute each label group separately.
+        for pos in label_rows:
+            m = int(_count_elts(pos))
+            if m == 1:
+                # Single-member group: nothing to permute, copy the value to every row.
+                y_batch[:, pos] = y_arr[pos]
+                continue
+            # Argsorting random keys gives one permutation of the group per row.
+            keys = _rng_random(rng, (cur, m), backend_name, dtype=key_dtype, device=device)
+            perm = backend.xp.argsort(keys, axis=1)
+            y_batch[:, pos] = y_arr[pos][perm]
+        yield y_batch
+@dataclass
+class BootstrapResult:
+    """Result object for bootstrap-based statistics."""
+    statistic_name: str
+    strategy: str
+    observed: float
+    samples: Any
+    confidence_interval: Tuple[float, float]
+    confidence_level: float
+    n_resamples: int
+    random_state: Optional[int]
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    def to_dict(self) -> Dict[str, Any]:
+        samples_np = _to_numpy(self.samples)
+        return {
+            "statistic_name": self.statistic_name,
+            "strategy": self.strategy,
+            "observed": float(self.observed),
+            "samples": samples_np.tolist(),
+            "confidence_interval": [
+                float(self.confidence_interval[0]),
+                float(self.confidence_interval[1]),
+            ],
+            "confidence_level": float(self.confidence_level),
+            "n_resamples": int(self.n_resamples),
+            "random_state": self.random_state,
+            "metadata": self.metadata,
+        }
+    def to_dataframe(self):
+        try:
+            import pandas as pd
+        except ImportError as exc:
+            raise ImportError("pandas is required for to_dataframe()") from exc
+        samples_np = _to_numpy(self.samples)
+        return pd.DataFrame(
+            {
+                "sample_index": np.arange(samples_np.size, dtype=int),
+                "statistic": samples_np,
+            }
+        )
+@dataclass
+class PermutationTestResult:
+    """Result object for permutation tests."""
+    statistic_name: str
+    strategy: str
+    alternative: str
+    observed: float
+    samples: Any
+    pvalue: float
+    n_resamples: int
+    random_state: Optional[int]
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    def to_dict(self) -> Dict[str, Any]:
+        samples_np = _to_numpy(self.samples)
+        return {
+            "statistic_name": self.statistic_name,
+            "strategy": self.strategy,
+            "alternative": self.alternative,
+            "observed": float(self.observed),
+            "samples": samples_np.tolist(),
+            "pvalue": float(self.pvalue),
+            "n_resamples": int(self.n_resamples),
+            "random_state": self.random_state,
+            "metadata": self.metadata,
+        }
+    def to_dataframe(self):
+        try:
+            import pandas as pd
+        except ImportError as exc:
+            raise ImportError("pandas is required for to_dataframe()") from exc
+        samples_np = _to_numpy(self.samples)
+        return pd.DataFrame(
+            {
+                "sample_index": np.arange(samples_np.size, dtype=int),
+                "statistic": samples_np,
+            }
+        )
+def _validate_confidence_level(confidence_level: float) -> float:
+    level = float(confidence_level)
+    if level <= 0.0 or level >= 1.0:
+        raise ValueError("confidence_level must be in (0, 1)")
+    return level
+def _validate_n_resamples(n_resamples: int) -> int:
+    n = int(n_resamples)
+    if n <= 0:
+        raise ValueError("n_resamples must be a positive integer")
+    return n
+def _ensure_same_first_dim(arrays: Sequence[Any]) -> int:
+    if len(arrays) == 0:
+        raise ValueError("At least one array is required")
+    n = arrays[0].shape[0]
+    for arr in arrays[1:]:
+        if arr.shape[0] != n:
+            raise ValueError("All arrays must have the same length in axis 0")
+    return n
+def _bootstrap_indices_iid(rng, n: int, backend_name: str, device: str = "cuda"):
+    return _rng_integers(rng, 0, n, size=n, backend_name=backend_name, device=device)
+def _prepare_bootstrap_state(
+    n: int,
+    strategy: str,
+    strata,
+    clusters,
+    block_size: Optional[int],
+    backend_name: str,
+):
+    backend = get_backend(backend_name)
+    strategy_n = str(strategy).strip().lower()
+    if strategy_n == "iid":
+        return {"strategy": strategy_n, "n_samples": int(n)}
+    if strategy_n == "stratified":
+        if strata is None:
+            raise ValueError("strata is required when strategy='stratified'")
+        strata_arr = backend.asarray(strata).reshape(-1)
+        if int(strata_arr.shape[0]) != n:
+            raise ValueError("strata must have the same length as arrays")
+        labels = backend.xp.unique(strata_arr)
+        rows = tuple(backend.astype(backend.xp.where(strata_arr == label)[0], backend.int64) for label in labels)
+        sizes = np.asarray([int(_count_elts(r)) for r in rows], dtype=np.int64)
+        uniform_size = int(sizes[0]) if sizes.size > 0 and np.all(sizes == sizes[0]) else None
+        rows_matrix = None
+        if uniform_size is not None:
+            rows_matrix = backend.astype(backend.xp.stack(rows, axis=0), backend.int64)
+        return {
+            "strategy": strategy_n,
+            "n_samples": int(n),
+            "strata_rows": rows,
+            "strata_sizes": tuple(int(s) for s in sizes.tolist()),
+            "strata_uniform_size": uniform_size,
+            "strata_rows_matrix": rows_matrix,
+        }
+    if strategy_n == "cluster":
+        if clusters is None:
+            raise ValueError("clusters is required when strategy='cluster'")
+        clusters_arr = backend.asarray(clusters).reshape(-1)
+        if int(clusters_arr.shape[0]) != n:
+            raise ValueError("clusters must have the same length as arrays")
+        labels = backend.xp.unique(clusters_arr)
+        rows = tuple(backend.astype(backend.xp.where(clusters_arr == label)[0], backend.int64) for label in labels)
+        if len(rows) == 0:
+            raise ValueError("clusters must contain at least one group")
+        sizes = np.asarray([int(_count_elts(r)) for r in rows], dtype=np.int64)
+        avg_size = float(np.mean(sizes)) if sizes.size > 0 else 1.0
+        avg_size = max(avg_size, 1.0)
+        uniform_size = int(sizes[0]) if np.all(sizes == sizes[0]) else None
+        rows_matrix = None
+        if uniform_size is not None:
+            # Uniform clusters can be assembled in dense batched form without padding/masking.
+            rows_matrix = backend.astype(backend.xp.stack(rows, axis=0), backend.int64)
+        return {
+            "strategy": strategy_n,
+            "n_samples": int(n),
+            "cluster_rows": rows,
+            "cluster_sizes": sizes,
+            "n_clusters": len(rows),
+            "avg_cluster_size": avg_size,
+            "cluster_uniform_size": uniform_size,
+            "cluster_rows_matrix": rows_matrix,
+        }
+    if strategy_n == "block":
+        b = int(block_size) if block_size is not None else 0
+        if b <= 0:
+            raise ValueError("block_size must be a positive integer for block bootstrap")
+        b_eff = min(b, n)
+        n_blocks = int(np.ceil(n / b_eff))
+        max_start = max(1, n - b_eff + 1)
+        return {
+            "strategy": strategy_n,
+            "n_samples": int(n),
+            "block_size": b_eff,
+            "n_blocks": n_blocks,
+            "max_start": max_start,
+        }
+    raise ValueError("strategy must be one of: 'iid', 'stratified', 'cluster', 'block'")
+def _bootstrap_indices_stratified(
+    rng,
+    state,
+    backend_name: str,
+    device: str = "cuda",
+):
+    backend = get_backend(backend_name)
+    chunks = []
+    n = 0
+    for pos in state["strata_rows"]:
+        pos_n = int(_count_elts(pos))
+        sampled_local = _rng_integers(rng, 0, pos_n, size=pos_n, backend_name=backend_name, device=device)
+        chunks.append(pos[sampled_local])
+        n += pos_n
+    idx = backend.concatenate(chunks) if chunks else xp_empty((0,), backend.int64, backend.xp, state["strata_rows"][0])
+    if int(_count_elts(idx)) != int(n):
+        raise RuntimeError("Stratified bootstrap produced invalid sample size")
+    perm = _rng_permutation(rng, int(_count_elts(idx)), backend_name, device=device)
+    return backend.astype(idx[perm], backend.int64)
+def _bootstrap_indices_cluster(
+    rng,
+    n: int,
+    state,
+    backend_name: str,
+    device: str = "cuda",
+):
+    backend = get_backend(backend_name)
+    cluster_rows = state["cluster_rows"]
+    cluster_sizes = state["cluster_sizes"]
+    n_clusters = int(state["n_clusters"])
+    avg_size = float(state["avg_cluster_size"])
+    # Sample cluster ids in batches to avoid scalar sync per sampled cluster.
+    selected_ids = []
+    total_size = 0
+    batch = max(4, int(np.ceil(n / avg_size)))
+    while total_size < n:
+        ids = _rng_integers(rng, 0, n_clusters, size=batch, backend_name=backend_name, device=device)
+        ids_np = _to_numpy(ids).astype(np.int64, copy=False)
+        selected_ids.extend(ids_np.tolist())
+        total_size += int(cluster_sizes[ids_np].sum())
+        if total_size < n:
+            remaining = n - total_size
+            batch = max(1, int(np.ceil(remaining / avg_size)) + 1)
+    chunks = []
+    filled = 0
+    for cid in selected_ids:
+        rows = cluster_rows[int(cid)]
+        chunks.append(rows)
+        filled += int(_count_elts(rows))
+        if filled >= n:
+            break
+    _ref = cluster_rows[0] if len(cluster_rows) > 0 else None
+    idx = backend.concatenate(chunks)[:n] if chunks else xp_empty((0,), backend.int64, backend.xp, _ref)
+    return backend.astype(idx, backend.int64)
+def _bootstrap_indices_block(
+    rng,
+    n: int,
+    state,
+    backend_name: str,
+    device: str = "cuda",
+):
+    backend = get_backend(backend_name)
+    b = int(state["block_size"])
+    n_blocks = int(state["n_blocks"])
+    max_start = int(state["max_start"])
+    starts = _rng_integers(rng, 0, max_start, size=n_blocks, backend_name=backend_name, device=device)
+    offsets = backend.arange(b, dtype=backend.int64)
+    idx = (starts.reshape(-1, 1) + offsets.reshape(1, -1)).reshape(-1)
+    return backend.astype(idx[:n], backend.int64)
+def _build_bootstrap_indices(
+    rng,
+    n: int,
+    state,
+    backend_name: str,
+    device: str = "cuda",
+):
+    strategy_n = state["strategy"]
+    if strategy_n == "iid":
+        return _bootstrap_indices_iid(rng, n, backend_name, device=device)
+    if strategy_n == "stratified":
+        return _bootstrap_indices_stratified(rng, state, backend_name, device=device)
+    if strategy_n == "cluster":
+        return _bootstrap_indices_cluster(rng, n, state, backend_name, device=device)
+    if strategy_n == "block":
+        return _bootstrap_indices_block(rng, n, state, backend_name, device=device)
+    raise ValueError("strategy must be one of: 'iid', 'stratified', 'cluster', 'block'")
+def bootstrap_statistic(
+    statistic: Callable[..., float],
+    *arrays,
+    n_resamples: int = 200,
+    strategy: str = "iid",
+    strata=None,
+    clusters=None,
+    block_size: Optional[int] = None,
+    confidence_level: float = 0.95,
+    random_state: Optional[int] = None,
+    statistic_name: str = "statistic",
+    backend: str = "auto",
+    force_vectorized: bool = False,
+    statistic_hint: Optional[str] = None,
+) -> BootstrapResult:
+    """
+    Generic bootstrap engine over one or multiple aligned arrays.
+    Parameters
+    ----------
+    statistic : callable
+        A function receiving resampled arrays and returning a scalar.
+        On CuPy IID paths, a vectorized callable is also supported:
+        if called with batched samples and it returns a vector of length
+        ``batch_size``, that vectorized output is used directly.
+    *arrays : array-like
+        One or more arrays with aligned first dimension.
+    n_resamples : int, default=200
+        Number of bootstrap resamples.
+    strategy : {'iid', 'stratified', 'cluster', 'block'}, default='iid'
+        Resampling strategy.
+    strata : array-like, optional
+        Strata labels used by stratified bootstrap.
+    clusters : array-like, optional
+        Cluster labels used by cluster bootstrap.
+    block_size : int, optional
+        Block size for block bootstrap.
+    confidence_level : float, default=0.95
+        Confidence level for percentile CI.
+    random_state : int, optional
+        Seed for reproducibility.
+    statistic_name : str, default='statistic'
+        Name to attach to the result object.
+    backend : {'auto', 'numpy', 'cupy'}, default='auto'
+        Backend selection. 'auto' infers from input arrays.
+    force_vectorized : bool, default=False
+        If True, require the statistic callable (or fastpath) to produce
+        vectorized batch output on IID path; raises if unavailable.
+    statistic_hint : {'mean', 'pearson_corr'} or None, default=None
+        Optional built-in fastpath hint. For bootstrap, ``'mean'`` enables
+        direct batch mean computation on IID path.
+    Returns
+    -------
+    BootstrapResult
+        Structured bootstrap result with samples and confidence interval.
+    """
+    n_boot = _validate_n_resamples(n_resamples)
+    level = _validate_confidence_level(confidence_level)
+    backend_name = _resolve_backend(backend, *arrays, strata, clusters)
+    backend = get_backend(backend_name)
+    arrays_xp = [backend.asarray(a) for a in arrays]
+    n = _ensure_same_first_dim(arrays_xp)
+    if strata is not None and backend.asarray(strata).shape[0] != n:
+        raise ValueError("strata must have the same length as arrays")
+    if clusters is not None and backend.asarray(clusters).shape[0] != n:
+        raise ValueError("clusters must have the same length as arrays")
+    observed = _to_float_scalar(statistic(*arrays_xp))
+    fastpath_hint = _validate_fastpath_hint(statistic_hint)
+    bootstrap_state = _prepare_bootstrap_state(
+        n,
+        strategy,
+        strata,
+        clusters,
+        block_size,
+        backend_name,
+    )
+    if backend_name == "torch":
+        rng_device = str(arrays_xp[0].device)
+    else:
+        rng_device = "cuda"
+    rng = _rng_default(backend_name, random_state, device=rng_device)
+    samples = xp_empty(n_boot, backend.float64, backend.xp, arrays_xp[0])
+    strategy_n = bootstrap_state["strategy"]
+    if strategy_n == "iid":
+        vectorized_mode = None
+        write_pos = 0
+        for idx_batch in _iter_iid_bootstrap_index_batches(rng, n, n_boot, backend_name, device=rng_device):
+            cur = int(idx_batch.shape[0])
+            if fastpath_hint == "mean":
+                if len(arrays_xp) != 1:
+                    raise ValueError("statistic_hint='mean' requires a single input array")
+                sampled_batch = arrays_xp[0][idx_batch]
+                samples[write_pos : write_pos + cur] = _mean_batch_stat(sampled_batch, backend)
+                write_pos += cur
+                continue
+            if len(arrays_xp) == 1:
+                sampled_batch = arrays_xp[0][idx_batch]
+                if vectorized_mode is not False:
+                    vec_values = _try_vectorized_statistic(statistic, cur, backend, sampled_batch)
+                    if vec_values is not None:
+                        samples[write_pos : write_pos + cur] = vec_values
+                        vectorized_mode = True
+                        write_pos += cur
+                        continue
+                    if vectorized_mode is None:
+                        if force_vectorized:
+                            raise ValueError(
+                                "force_vectorized=True but statistic did not return "
+                                "a vector of length batch_size"
+                            )
+                        vectorized_mode = False
+                for j in range(cur):
+                    samples[write_pos + j] = _coerce_sample_value(statistic(sampled_batch[j]), backend)
+            else:
+                sampled_args_batch = [arr[idx_batch] for arr in arrays_xp]
+                if vectorized_mode is not False:
+                    vec_values = _try_vectorized_statistic(
+                        statistic,
+                        cur,
+                        backend,
+                        *sampled_args_batch,
+                    )
+                    if vec_values is not None:
+                        samples[write_pos : write_pos + cur] = vec_values
+                        vectorized_mode = True
+                        write_pos += cur
+                        continue
+                    if vectorized_mode is None:
+                        if force_vectorized:
+                            raise ValueError(
+                                "force_vectorized=True but statistic did not return "
+                                "a vector of length batch_size"
+                            )
+                        vectorized_mode = False
+                for j in range(cur):
+                    sampled_args = [arr[j] for arr in sampled_args_batch]
+                    samples[write_pos + j] = _coerce_sample_value(statistic(*sampled_args), backend)
+            write_pos += cur
+    elif strategy_n in ("stratified", "block") or (
+        strategy_n == "cluster" and bootstrap_state.get("cluster_rows_matrix") is not None
+    ):
+        vectorized_mode = None
+        write_pos = 0
+        shuffle_rows = not (fastpath_hint == "mean")
+        for idx_batch in _iter_non_iid_bootstrap_index_batches(
+            rng,
+            bootstrap_state,
+            n_boot,
+            backend_name,
+            device=rng_device,
+            shuffle_rows=shuffle_rows,
+        ):
+            cur = int(idx_batch.shape[0])
+            if fastpath_hint == "mean":
+                if len(arrays_xp) != 1:
+                    raise ValueError("statistic_hint='mean' requires a single input array")
+                sampled_batch = arrays_xp[0][idx_batch]
+                samples[write_pos : write_pos + cur] = _mean_batch_stat(sampled_batch, backend)
+                write_pos += cur
+                continue
+            if len(arrays_xp) == 1:
+                sampled_batch = arrays_xp[0][idx_batch]
+                if vectorized_mode is not False:
+                    vec_values = _try_vectorized_statistic(statistic, cur, backend, sampled_batch)
+                    if vec_values is not None:
+                        samples[write_pos : write_pos + cur] = vec_values
+                        vectorized_mode = True
+                        write_pos += cur
+                        continue
+                    if vectorized_mode is None:
+                        if force_vectorized:
+                            raise ValueError(
+                                "force_vectorized=True but statistic did not return "
+                                "a vector of length batch_size"
+                            )
+                        vectorized_mode = False
+                for j in range(cur):
+                    samples[write_pos + j] = _coerce_sample_value(statistic(sampled_batch[j]), backend)
+            else:
+                sampled_args_batch = [arr[idx_batch] for arr in arrays_xp]
+                if vectorized_mode is not False:
+                    vec_values = _try_vectorized_statistic(
+                        statistic,
+                        cur,
+                        backend,
+                        *sampled_args_batch,
+                    )
+                    if vec_values is not None:
+                        samples[write_pos : write_pos + cur] = vec_values
+                        vectorized_mode = True
+                        write_pos += cur
+                        continue
+                    if vectorized_mode is None:
+                        if force_vectorized:
+                            raise ValueError(
+                                "force_vectorized=True but statistic did not return "
+                                "a vector of length batch_size"
+                            )
+                        vectorized_mode = False
+                for j in range(cur):
+                    sampled_args = [arr[j] for arr in sampled_args_batch]
+                    samples[write_pos + j] = _coerce_sample_value(statistic(*sampled_args), backend)
+            write_pos += cur
+    else:
+        for i in range(n_boot):
+            idx = _build_bootstrap_indices(
+                rng,
+                n,
+                bootstrap_state,
+                backend_name,
+                device=rng_device,
+            )
+            sampled_args = [arr[idx] for arr in arrays_xp]
+            samples[i] = _coerce_sample_value(statistic(*sampled_args), backend)
+    alpha = 1.0 - level
+    ci = (
+        _to_float_scalar(backend.xp.quantile(samples, alpha / 2.0)),
+        _to_float_scalar(backend.xp.quantile(samples, 1.0 - alpha / 2.0)),
+    )
+    return BootstrapResult(
+        statistic_name=str(statistic_name),
+        strategy=str(strategy).lower(),
+        observed=observed,
+        samples=samples,
+        confidence_interval=ci,
+        confidence_level=level,
+        n_resamples=n_boot,
+        random_state=random_state,
+        metadata={"n_samples": n, "backend": backend_name},
+    )
+def _permute_y(
+    rng,
+    y,
+    state,
+    backend_name: str,
+    device: str = "cuda",
+):
+    backend = get_backend(backend_name)
+    strategy_n = state["strategy"]
+    y_arr = backend.asarray(y)
+    if strategy_n == "iid":
+        perm = _rng_permutation(rng, int(y_arr.shape[0]), backend_name, device=device)
+        return y_arr[perm]
+    if strategy_n in ("stratified", "grouped"):
+        y_perm = y_arr.copy()
+        for pos in state["label_rows"]:
+            shuffled_pos = pos[_rng_permutation(rng, int(_count_elts(pos)), backend_name, device=device)]
+            y_perm[pos] = y_arr[shuffled_pos]
+        return y_perm
+    raise ValueError("strategy must be one of: 'iid', 'stratified', 'grouped'")
+def _prepare_permutation_state(
+    n: int,
+    strategy: str,
+    strata,
+    groups,
+    backend_name: str,
+):
+    backend = get_backend(backend_name)
+    strategy_n = str(strategy).strip().lower()
+    if strategy_n == "iid":
+        return {"strategy": strategy_n, "n_samples": int(n)}
+    if strategy_n in ("stratified", "grouped"):
+        labels = strata if strategy_n == "stratified" else groups
+        if labels is None:
+            key = "strata" if strategy_n == "stratified" else "groups"
+            raise ValueError(f"{key} is required when strategy='{strategy_n}'")
+        labels_arr = backend.asarray(labels).reshape(-1)
+        if int(labels_arr.shape[0]) != n:
+            raise ValueError("labels must have same length as y")
+        unique_labels = backend.xp.unique(labels_arr)
+        label_rows = tuple(backend.astype(backend.xp.where(labels_arr == label)[0], backend.int64) for label in unique_labels)
+        dense_label_rows = None
+        dense_valid_mask = None
+        dense_valid_flat = None
+        dense_pos_valid = None
+        label_sizes = tuple(int(_count_elts(pos)) for pos in label_rows)
+        # Build a dense label matrix for CuPy when groups are not too ragged.
+        if backend_name == "cupy" and len(label_sizes) > 0:
+            max_label_size = max(label_sizes)
+            if max_label_size > 1:
+                fill_ratio = float(n) / float(len(label_sizes) * max_label_size)
+                if fill_ratio >= 0.60:
+                    dense_label_rows = backend.full((len(label_rows), max_label_size), -1, dtype=backend.int64)
+                    dense_valid_mask = backend.xp.zeros((len(label_rows), max_label_size), dtype=bool)
+                    for i, pos in enumerate(label_rows):
+                        m = label_sizes[i]
+                        dense_label_rows[i, :m] = pos
+                        dense_valid_mask[i, :m] = True
+                    dense_valid_flat = dense_valid_mask.reshape(-1)
+                    dense_pos_valid = dense_label_rows.reshape(-1)[dense_valid_flat]
+        return {
+            "strategy": strategy_n,
+            "n_samples": int(n),
+            "label_rows": label_rows,
+            "label_sizes": label_sizes,
+            "dense_label_rows": dense_label_rows,
+            "dense_valid_mask": dense_valid_mask,
+            "dense_valid_flat": dense_valid_flat,
+            "dense_pos_valid": dense_pos_valid,
+        }
+    raise ValueError("strategy must be one of: 'iid', 'stratified', 'grouped'")
+def permutation_test(
+    statistic: Callable[[Any, Any], float],
+    X,
+    y,
+    n_resamples: int = 1000,
+    strategy: str = "iid",
+    strata=None,
+    groups=None,
+    alternative: str = "two-sided",
+    random_state: Optional[int] = None,
+    statistic_name: str = "statistic",
+    backend: str = "auto",
+    force_vectorized: bool = False,
+    statistic_hint: Optional[str] = None,
+) -> PermutationTestResult:
+    """
+    Generic permutation test for a supervised statistic ``statistic(X, y)``.
+    Parameters
+    ----------
+    statistic : callable
+        Function receiving ``(X, y)`` and returning a scalar.
+        On CuPy IID paths, vectorized output is supported when ``y`` is a
+        batch matrix and the callable returns a vector with one value per row.
+    X : array-like
+        Feature matrix.
+    y : array-like
+        Response vector.
+    n_resamples : int, default=1000
+        Number of permutation resamples.
+    strategy : {'iid', 'stratified', 'grouped'}, default='iid'
+        Permutation strategy. 'grouped' permutes within groups.
+    strata : array-like, optional
+        Strata labels used by strategy='stratified'.
+    groups : array-like, optional
+        Group labels used by strategy='grouped'.
+    alternative : {'two-sided', 'greater', 'less'}, default='two-sided'
+        Alternative hypothesis.
+    random_state : int, optional
+        Random seed.
+    statistic_name : str, default='statistic'
+        Name to attach to the result object.
+    backend : {'auto', 'numpy', 'cupy'}, default='auto'
+        Backend selection. 'auto' infers from input arrays.
+    force_vectorized : bool, default=False
+        If True, require vectorized batch output on IID path; raises if
+        statistic is not vectorized-compatible.
+    statistic_hint : {'mean', 'pearson_corr'} or None, default=None
+        Optional built-in fastpath hint. For permutation, ``'pearson_corr'``
+        computes Pearson correlation in vectorized batches for IID path.
+    Returns
+    -------
+    PermutationTestResult
+        Structured permutation test result with empirical p-value.
+    """
+    n_perm = _validate_n_resamples(n_resamples)
+    alt = str(alternative).strip().lower()
+    if alt not in ("two-sided", "greater", "less"):
+        raise ValueError("alternative must be one of: 'two-sided', 'greater', 'less'")
+    backend_name = _resolve_backend(backend, X, y, strata, groups)
+    backend = get_backend(backend_name)
+    X_arr = backend.asarray(X)
+    y_arr = backend.asarray(y).reshape(-1)
+    if X_arr.shape[0] != y_arr.shape[0]:
+        raise ValueError("X and y must have the same number of rows")
+    observed = _to_float_scalar(statistic(X_arr, y_arr))
+    fastpath_hint = _validate_fastpath_hint(statistic_hint)
+    permutation_state = _prepare_permutation_state(
+        int(y_arr.shape[0]),
+        strategy,
+        strata,
+        groups,
+        backend_name,
+    )
+    if backend_name == "torch":
+        rng_device = str(y_arr.device)
+    else:
+        rng_device = "cuda"
+    rng = _rng_default(backend_name, random_state, device=rng_device)
+    samples = xp_empty(n_perm, backend.float64, backend.xp, y_arr)
+    strategy_n = permutation_state["strategy"]
+    x_vec_fast = None
+    if fastpath_hint == "pearson_corr":
+        x_vec_fast = _select_single_feature_vector(X_arr, backend)
+    if strategy_n == "iid":
+        vectorized_mode = None
+        write_pos = 0
+        for perm_batch in _iter_iid_permutation_batches(
+            rng,
+            int(y_arr.shape[0]),
+            n_perm,
+            backend_name,
+            device=rng_device,
+        ):
+            cur = int(perm_batch.shape[0])
+            y_perm_batch = y_arr[perm_batch]
+            if fastpath_hint == "pearson_corr":
+                corr_batch = _pearson_corr_with_y_batch(x_vec_fast, y_perm_batch, backend)
+                samples[write_pos : write_pos + cur] = _coerce_vectorized_values(corr_batch, cur, backend)
+                write_pos += cur
+                continue
+            if vectorized_mode is not False:
+                vec_values = _try_vectorized_statistic(
+                    statistic,
+                    cur,
+                    backend,
+                    X_arr,
+                    y_perm_batch,
+                )
+                if vec_values is not None:
+                    samples[write_pos : write_pos + cur] = vec_values
+                    vectorized_mode = True
+                    write_pos += cur
+                    continue
+                if vectorized_mode is None:
+                    if force_vectorized:
+                        raise ValueError(
+                            "force_vectorized=True but statistic did not return "
+                            "a vector of length batch_size"
+                        )
+                    vectorized_mode = False
+            for j in range(cur):
+                samples[write_pos + j] = _coerce_sample_value(statistic(X_arr, y_perm_batch[j]), backend)
+            write_pos += cur
+    else:
+        vectorized_mode = None
+        write_pos = 0
+        for y_perm_batch in _iter_labelwise_permuted_y_batches(
+            rng,
+            y_arr,
+            permutation_state,
+            n_perm,
+            backend_name,
+            device=rng_device,
+        ):
+            cur = int(y_perm_batch.shape[0])
+            if fastpath_hint == "pearson_corr":
+                corr_batch = _pearson_corr_with_y_batch(x_vec_fast, y_perm_batch, backend)
+                samples[write_pos : write_pos + cur] = _coerce_vectorized_values(corr_batch, cur, backend)
+                write_pos += cur
+                continue
+            if vectorized_mode is not False:
+                vec_values = _try_vectorized_statistic(
+                    statistic,
+                    cur,
+                    backend,
+                    X_arr,
+                    y_perm_batch,
+                )
+                if vec_values is not None:
+                    samples[write_pos : write_pos + cur] = vec_values
+                    vectorized_mode = True
+                    write_pos += cur
+                    continue
+                if vectorized_mode is None:
+                    if force_vectorized:
+                        raise ValueError(
+                            "force_vectorized=True but statistic did not return "
+                            "a vector of length batch_size"
+                        )
+                    vectorized_mode = False
+            for j in range(cur):
+                samples[write_pos + j] = _coerce_sample_value(statistic(X_arr, y_perm_batch[j]), backend)
+            write_pos += cur
+    if alt == "two-sided":
+        numerator = _to_float_scalar(backend.xp.sum(backend.xp.abs(samples) >= abs(observed)))
+    elif alt == "greater":
+        numerator = _to_float_scalar(backend.xp.sum(samples >= observed))
+    else:
+        numerator = _to_float_scalar(backend.xp.sum(samples <= observed))
+    pvalue = float((numerator + 1.0) / (n_perm + 1.0))
+    return PermutationTestResult(
+        statistic_name=str(statistic_name),
+        strategy=str(strategy).lower(),
+        alternative=alt,
+        observed=observed,
+        samples=samples,
+        pvalue=pvalue,
+        n_resamples=n_perm,
+        random_state=random_state,
+        metadata={"n_samples": int(y_arr.shape[0]), "backend": backend_name},
+    )