PyPI - statgpu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

statgpu/__init__.py +174 -0
statgpu/_base.py +544 -0
statgpu/_config.py +127 -0
statgpu/anova/__init__.py +5 -0
statgpu/anova/_oneway.py +194 -0
statgpu/backends/__init__.py +83 -0
statgpu/backends/_array_ops.py +529 -0
statgpu/backends/_base.py +184 -0
statgpu/backends/_cupy.py +453 -0
statgpu/backends/_factory.py +65 -0
statgpu/backends/_gpu_inference_cupy.py +214 -0
statgpu/backends/_gpu_inference_torch.py +422 -0
statgpu/backends/_numpy.py +324 -0
statgpu/backends/_torch.py +685 -0
statgpu/backends/_torch_safe.py +47 -0
statgpu/backends/_utils.py +423 -0
statgpu/core/__init__.py +10 -0
statgpu/core/formula/__init__.py +33 -0
statgpu/core/formula/_design.py +99 -0
statgpu/core/formula/_parser.py +191 -0
statgpu/core/formula/_terms.py +70 -0
statgpu/core/formula/tests/__init__.py +0 -0
statgpu/core/formula/tests/test_parser.py +194 -0
statgpu/covariance/__init__.py +6 -0
statgpu/covariance/_empirical.py +310 -0
statgpu/covariance/_shrinkage.py +248 -0
statgpu/cross_validation/__init__.py +31 -0
statgpu/cross_validation/_base.py +410 -0
statgpu/cross_validation/_engine.py +167 -0
statgpu/diagnostics/__init__.py +7 -0
statgpu/diagnostics/_regression_diagnostics.py +188 -0
statgpu/feature_selection/__init__.py +24 -0
statgpu/feature_selection/_knockoff.py +870 -0
statgpu/feature_selection/_knockoff_utils.py +1003 -0
statgpu/feature_selection/_stepwise.py +300 -0
statgpu/glm_core/__init__.py +81 -0
statgpu/glm_core/_base.py +202 -0
statgpu/glm_core/_family.py +362 -0
statgpu/glm_core/_fused.py +149 -0
statgpu/glm_core/_gamma.py +111 -0
statgpu/glm_core/_inverse_gaussian.py +62 -0
statgpu/glm_core/_irls.py +561 -0
statgpu/glm_core/_logistic.py +82 -0
statgpu/glm_core/_negative_binomial.py +68 -0
statgpu/glm_core/_poisson.py +60 -0
statgpu/glm_core/_solver_legacy.py +100 -0
statgpu/glm_core/_squared.py +53 -0
statgpu/glm_core/_tweedie.py +74 -0
statgpu/inference/__init__.py +239 -0
statgpu/inference/_distributions_backend.py +2610 -0
statgpu/inference/_multiple_testing.py +391 -0
statgpu/inference/_resampling.py +1400 -0
statgpu/inference/_results.py +265 -0
statgpu/linear_model/__init__.py +75 -0
statgpu/linear_model/_gaussian_inference.py +306 -0
statgpu/linear_model/_glm_base.py +1261 -0
statgpu/linear_model/_ordered_logit.py +52 -0
statgpu/linear_model/_ordered_probit.py +50 -0
statgpu/linear_model/_stats.py +170 -0
statgpu/linear_model/cv/__init__.py +13 -0
statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
statgpu/linear_model/cv/_lasso_cv.py +253 -0
statgpu/linear_model/cv/_logistic_cv.py +895 -0
statgpu/linear_model/cv/_ridge_cv.py +1160 -0
statgpu/linear_model/legacy/__init__.py +1 -0
statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
statgpu/linear_model/legacy/_solver_legacy.py +104 -0
statgpu/linear_model/penalized/__init__.py +25 -0
statgpu/linear_model/penalized/_base.py +437 -0
statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
statgpu/linear_model/penalized/_penalized_linear.py +236 -0
statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
statgpu/linear_model/penalized/_predict_mixin.py +182 -0
statgpu/linear_model/wrappers/__init__.py +31 -0
statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
statgpu/linear_model/wrappers/_elasticnet.py +75 -0
statgpu/linear_model/wrappers/_gamma.py +67 -0
statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
statgpu/linear_model/wrappers/_lasso.py +2124 -0
statgpu/linear_model/wrappers/_linear.py +1127 -0
statgpu/linear_model/wrappers/_logistic.py +1435 -0
statgpu/linear_model/wrappers/_mcp.py +58 -0
statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
statgpu/linear_model/wrappers/_poisson.py +48 -0
statgpu/linear_model/wrappers/_ridge.py +166 -0
statgpu/linear_model/wrappers/_scad.py +58 -0
statgpu/linear_model/wrappers/_tweedie.py +57 -0
statgpu/metrics/__init__.py +21 -0
statgpu/metrics/_classification.py +591 -0
statgpu/nonparametric/__init__.py +50 -0
statgpu/nonparametric/kernel_methods/__init__.py +25 -0
statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
statgpu/nonparametric/kernel_methods/_krr.py +234 -0
statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
statgpu/nonparametric/splines/__init__.py +5 -0
statgpu/nonparametric/splines/_bspline_basis.py +336 -0
statgpu/nonparametric/splines/_penalized.py +349 -0
statgpu/panel/__init__.py +19 -0
statgpu/panel/_covariance.py +140 -0
statgpu/panel/_fixed_effects.py +420 -0
statgpu/panel/_random_effects.py +385 -0
statgpu/panel/_utils.py +482 -0
statgpu/penalties/__init__.py +139 -0
statgpu/penalties/_adaptive_l1.py +313 -0
statgpu/penalties/_base.py +261 -0
statgpu/penalties/_categories.py +39 -0
statgpu/penalties/_elasticnet.py +98 -0
statgpu/penalties/_group_lasso.py +678 -0
statgpu/penalties/_group_mcp.py +553 -0
statgpu/penalties/_group_scad.py +605 -0
statgpu/penalties/_l1.py +107 -0
statgpu/penalties/_l2.py +77 -0
statgpu/penalties/_mcp.py +237 -0
statgpu/penalties/_scad.py +260 -0
statgpu/semiparametric/__init__.py +5 -0
statgpu/semiparametric/_gam.py +401 -0
statgpu/solvers/__init__.py +24 -0
statgpu/solvers/_admm.py +241 -0
statgpu/solvers/_constants.py +15 -0
statgpu/solvers/_convergence.py +6 -0
statgpu/solvers/_fista.py +436 -0
statgpu/solvers/_fista_bb.py +513 -0
statgpu/solvers/_fista_lla.py +541 -0
statgpu/solvers/_lbfgs.py +206 -0
statgpu/solvers/_newton.py +149 -0
statgpu/solvers/_utils.py +277 -0
statgpu/survival/__init__.py +14 -0
statgpu/survival/_cox.py +3974 -0
statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
statgpu/survival/_cox_cv.py +1159 -0
statgpu/survival/_cox_efron_cuda.py +1280 -0
statgpu/survival/_cox_efron_triton.py +359 -0
statgpu/unsupervised/__init__.py +29 -0
statgpu/unsupervised/_agglomerative.py +307 -0
statgpu/unsupervised/_dbscan.py +263 -0
statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
statgpu/unsupervised/_gmm.py +332 -0
statgpu/unsupervised/_incremental_pca.py +176 -0
statgpu/unsupervised/_kmeans.py +261 -0
statgpu/unsupervised/_minibatch_kmeans.py +299 -0
statgpu/unsupervised/_minibatch_nmf.py +252 -0
statgpu/unsupervised/_nmf.py +190 -0
statgpu/unsupervised/_pca.py +189 -0
statgpu/unsupervised/_truncated_svd.py +132 -0
statgpu/unsupervised/_tsne.py +192 -0
statgpu/unsupervised/_umap.py +224 -0
statgpu/unsupervised/_utils.py +134 -0
statgpu-0.1.0.dist-info/METADATA +245 -0
statgpu-0.1.0.dist-info/RECORD +168 -0
statgpu-0.1.0.dist-info/WHEEL +5 -0
statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
statgpu-0.1.0.dist-info/top_level.txt +1 -0

statgpu/inference/_distributions_backend.py ADDED Viewed

@@ -0,0 +1,2610 @@
+"""
+Unified probability-distribution backend.
+Supports ``numpy``, ``cupy``, and ``torch`` backends through a single
+``SpecialFunctions`` protocol, eliminating code duplication across
+``_distributions_gpu.py`` and ``_distributions_torch.py``.
+Usage::
+    from statgpu.inference._distributions_backend import get_distribution, norm, t
+    # Explicit backend
+    norm_dist = get_distribution("norm", backend="numpy")
+    norm_dist.cdf([0.0, 1.0, 2.0])
+    # Module-level proxy with auto backend detection
+    norm.cdf([0.0, 1.0, 2.0])
+    t.cdf(1.5, df=10, backend="cupy")
+"""
+from __future__ import annotations
+import math
+from abc import abstractmethod
+from functools import lru_cache
+from typing import Any, Protocol, runtime_checkable
+import numpy as np
+from statgpu.backends import _get_torch_device_str as _get_torch_device
+# =============================================================================
+# SpecialFunctions protocol — abstracts away library-specific special functions
+# =============================================================================
+@runtime_checkable
+class SpecialFunctions(Protocol):
+    """Protocol for special-function providers.
+    Implementations: ``CuPySpecialFunctions``, ``TorchSpecialFunctions``,
+    ``ScipySpecialFunctions``.
+    """
+    @abstractmethod
+    def betainc(self, a, b, x):
+        """Regularized incomplete beta I_x(a, b)."""
+    @abstractmethod
+    def betaincinv(self, a, b, y):
+        """Inverse regularized incomplete beta."""
+    @abstractmethod
+    def gammainc(self, a, x):
+        """Regularized lower incomplete gamma P(a, x)."""
+    @abstractmethod
+    def gammaincc(self, a, x):
+        """Regularized upper incomplete gamma Q(a, x)."""
+    @abstractmethod
+    def gammaincinv(self, a, q):
+        """Inverse regularized lower incomplete gamma."""
+    @abstractmethod
+    def gammaln(self, x):
+        """Log-gamma."""
+    @abstractmethod
+    def erf(self, x):
+        """Error function."""
+    @abstractmethod
+    def erfc(self, x):
+        """Complementary error function."""
+    @abstractmethod
+    def erfcinv(self, y):
+        """Inverse complementary error function."""
+# =============================================================================
+# CuPy backend
+# =============================================================================
+class CuPySpecialFunctions:
+    """Special functions via cupyx.scipy.special with LUT acceleration.
+    Inverse special functions (betaincinv, gammaincinv) use GPU-resident LUT
+    + 1-step Newton refinement for ~10-100x speedup over cupyx iterative solver.
+    """
+    def __init__(self, *, use_lut: bool = True):
+        import cupy as cp
+        import cupyx.scipy.special as csp
+        self._cp = cp
+        self._csp = csp
+        self.use_lut = use_lut
+        # LUT caches for inverse special functions (instance-level)
+        self._betaincinv_lut = {}
+        self._gammaincinv_lut = {}
+    def betainc(self, a, b, x):
+        return self._csp.betainc(a, b, self._cp.asarray(x, dtype=self._cp.float64))
+    def betaincinv(self, a, b, y):
+        cp = self._cp
+        yt = cp.asarray(y, dtype=cp.float64)
+        try:
+            af, bf = float(a), float(b)
+        except (TypeError, ValueError):
+            return self._csp.betaincinv(a, b, yt)
+        if not self.use_lut:
+            return self._csp.betaincinv(a, b, yt)
+        if af < 0.3 or bf < 0.3 or af > 50 or bf > 50 or abs(af - bf) > 30:
+            return self._csp.betaincinv(a, b, yt)
+        key = (af, bf)
+        if key not in self._betaincinv_lut:
+            x_grid, y_grid = self._build_betaincinv_lut(af, bf, 20000)
+            self._betaincinv_lut[key] = (cp.asarray(x_grid), cp.asarray(y_grid))
+        yg, xg = self._betaincinv_lut[key]
+        idx = cp.searchsorted(yg, cp.clip(yt, 1e-15, 1.0 - 1e-15)).clip(1, len(yg) - 1)
+        y0, y1 = yg[idx - 1], yg[idx]
+        x0, x1 = xg[idx - 1], xg[idx]
+        w = (yt - y0) / (y1 - y0 + 1e-300)
+        x = cp.clip(x0 + w * (x1 - x0), 1e-10, 1.0 - 1e-10)
+        # 1-step Newton refine using cupyx betainc
+        import math as _math
+        log_beta = _math.lgamma(af) + _math.lgamma(bf) - _math.lgamma(af + bf)
+        p = self._csp.betainc(af, bf, x)
+        diff = p - yt
+        log_deriv = (af - 1.0) * cp.log(cp.clip(x, 1e-300, None)) + \
+                    (bf - 1.0) * cp.log(cp.clip(1.0 - x, 1e-300, None)) - log_beta
+        deriv = cp.exp(log_deriv)
+        x1 = x - diff / cp.clip(deriv, 1e-300, 1e300)
+        return cp.clip(x1, 1e-15, 1.0 - 1e-15)
+    @staticmethod
+    def _build_betaincinv_lut(a, b, n_grid):
+        """Build LUT via scipy on CPU, returns (x_grid, y_grid) as numpy arrays.
+        Uses log spacing near both boundaries for better precision when
+        a or b is small (e.g. b=0.5 for t/f distributions).
+        """
+        import scipy.special as _scsp
+        eps = 1e-15
+        n_edge = int(n_grid * 0.4)
+        n_mid = n_grid - 2 * n_edge
+        x_lo = np.logspace(np.log10(eps), np.log10(0.01), n_edge)
+        x_mid = np.linspace(0.01, 0.99, n_mid + 2)[1:-1]
+        x_hi = 1.0 - np.logspace(np.log10(eps), np.log10(0.01), n_edge)[::-1]
+        x_grid = np.concatenate([x_lo, x_mid, x_hi])
+        if len(x_grid) > n_grid:
+            x_grid = x_grid[:n_grid]
+        y_grid = _scsp.betainc(a, b, x_grid)
+        return x_grid, y_grid
+    def gammainc(self, a, x):
+        return self._csp.gammainc(
+            self._cp.asarray(a, dtype=self._cp.float64),
+            self._cp.asarray(x, dtype=self._cp.float64),
+        )
+    def gammaincc(self, a, x):
+        return self._csp.gammaincc(
+            self._cp.asarray(a, dtype=self._cp.float64),
+            self._cp.asarray(x, dtype=self._cp.float64),
+        )
+    def gammaincinv(self, a, q):
+        cp = self._cp
+        qt = cp.asarray(q, dtype=cp.float64)
+        try:
+            af = float(a)
+        except (TypeError, ValueError):
+            return self._csp.gammaincinv(cp.asarray(a, dtype=cp.float64), qt)
+        if not self.use_lut:
+            return self._csp.gammaincinv(cp.asarray(a, dtype=cp.float64), qt)
+        if af < 1.0:
+            return self._csp.gammaincinv(cp.asarray(a, dtype=cp.float64), qt)
+        key = (af,)
+        if key not in self._gammaincinv_lut:
+            x_grid, y_grid = self._build_gammaincinv_lut(af, 20000)
+            self._gammaincinv_lut[key] = (cp.asarray(x_grid), cp.asarray(y_grid))
+        yg, xg = self._gammaincinv_lut[key]
+        idx = cp.searchsorted(yg, cp.clip(qt, 1e-15, 1.0 - 1e-15)).clip(1, len(yg) - 1)
+        y0, y1 = yg[idx - 1], yg[idx]
+        x0, x1 = xg[idx - 1], xg[idx]
+        w = (qt - y0) / (y1 - y0 + 1e-300)
+        x = cp.clip(x0 + w * (x1 - x0), 1e-15, 1e6)
+        # 1-step Newton refine using cupyx gammainc
+        import math as _math
+        log_ga = _math.lgamma(af)
+        p = self._csp.gammainc(af, x)
+        diff = p - qt
+        log_deriv = (af - 1.0) * cp.log(cp.clip(x, 1e-300, None)) - x - log_ga
+        deriv = cp.exp(log_deriv)
+        x1 = x - diff / cp.clip(deriv, 1e-300, 1e300)
+        return cp.clip(x1, 1e-15, 1e6)
+    @staticmethod
+    def _build_gammaincinv_lut(a, n_grid):
+        """Build LUT via scipy on CPU, returns (x_grid, y_grid) as numpy arrays."""
+        import math
+        import scipy.special as _scsp
+        x_max = a + 20 * math.sqrt(max(a, 0.1)) + 10
+        x_max = min(x_max, 1e6)
+        n_log = n_grid // 3
+        n_lin = n_grid - n_log
+        x_lo = np.logspace(-15, math.log10(max(x_max, 1e-10)), n_log, endpoint=False)
+        x_hi = np.linspace(x_lo[-1] if len(x_lo) > 0 else 0, x_max, n_lin + 1)[1:]
+        x_grid = np.concatenate([x_lo, x_hi])
+        if len(x_grid) < n_grid:
+            extra = np.linspace(x_grid[-1], x_max, n_grid - len(x_grid) + 2)[1:]
+            x_grid = np.concatenate([x_grid, extra])
+        x_grid = x_grid[:n_grid]
+        y_grid = _scsp.gammainc(a, x_grid)
+        y_grid[0] = 0.0
+        y_grid[-1] = 1.0
+        return x_grid, y_grid
+    def gammaln(self, x):
+        return self._csp.gammaln(self._cp.asarray(x, dtype=self._cp.float64))
+    def erf(self, x):
+        return self._csp.erf(self._cp.asarray(x, dtype=self._cp.float64))
+    def erfc(self, x):
+        return self._csp.erfc(self._cp.asarray(x, dtype=self._cp.float64))
+    def erfcinv(self, y):
+        return self._csp.erfcinv(self._cp.asarray(y, dtype=self._cp.float64))
+    def sqrt(self, x):
+        return self._cp.sqrt(self._cp.asarray(x, dtype=self._cp.float64))
+    @property
+    def pi(self):
+        return self._cp.pi
+    def clip(self, x, lo, hi):
+        return self._cp.clip(x, lo, hi)
+    def where(self, cond, x, y):
+        return self._cp.where(cond, x, y)
+    def as_float64(self, x):
+        return self._cp.asarray(x, dtype=self._cp.float64)
+# =============================================================================
+# Torch backend
+# =============================================================================
+# Module-level cache for torch betaincinv inverse LUTs (scalar a, b)
+# Key: (a, b, device) -> (y_grid, x_grid) tensors on device
+_torch_betaincinv_lut_cache: dict = {}
+# Module-level cache for torch betainc forward LUTs (scalar a, b)
+# Key: (a, b, device) -> (x_grid, y_grid) tensors on device
+_torch_betainc_lut_cache: dict = {}
+def _get_torch_betaincinv_lut(a, b, device, n_points=20000):
+    """Build a GPU-resident inverse LUT for betaincinv(a, b, y).
+    Precomputes x = betaincinv(a, b, y) for 20K y values via scipy on CPU
+    (one-time cost, <200ms) then uses searchsorted for O(log n) lookup.
+    """
+    from scipy import special as _scsp
+    import torch
+    cache_key = (a, b, device)
+    if cache_key in _torch_betaincinv_lut_cache:
+        return _torch_betaincinv_lut_cache[cache_key]
+    y_vals = np.linspace(1e-15, 1.0 - 1e-15, n_points)
+    x_vals = _scsp.betaincinv(a, b, y_vals)
+    y_grid = torch.as_tensor(y_vals, dtype=torch.float64, device=device)
+    x_grid = torch.as_tensor(x_vals, dtype=torch.float64, device=device)
+    _torch_betaincinv_lut_cache[cache_key] = (y_grid, x_grid)
+    return y_grid, x_grid
+def _get_torch_betainc_lut(a, b, device, n_points=40000):
+    """Build a GPU-resident forward LUT for betainc(a, b, x).
+    Precomputes y = betainc(a, b, x) for 40K x values via scipy on CPU
+    (one-time cost, <50ms) then uses searchsorted for O(log n) lookup.
+    Uses log spacing near boundaries for better precision when a or b is small.
+    """
+    from scipy import special as _scsp
+    import torch
+    cache_key = (a, b, device)
+    if cache_key in _torch_betainc_lut_cache:
+        return _torch_betainc_lut_cache[cache_key]
+    # Log spacing near boundaries for b < 1 singularity
+    eps = 1e-15
+    n_edge = int(n_points * 0.4)
+    n_mid = n_points - 2 * n_edge
+    x_lo = np.logspace(np.log10(eps), np.log10(0.01), n_edge)
+    x_mid = np.linspace(0.01, 0.99, n_mid + 2)[1:-1]
+    x_hi = 1.0 - np.logspace(np.log10(eps), np.log10(0.01), n_edge)[::-1]
+    x_vals = np.concatenate([x_lo, x_mid, x_hi])[:n_points]
+    y_vals = _scsp.betainc(a, b, x_vals)
+    x_grid = torch.as_tensor(x_vals, dtype=torch.float64, device=device)
+    y_grid = torch.as_tensor(y_vals, dtype=torch.float64, device=device)
+    _torch_betainc_lut_cache[cache_key] = (x_grid, y_grid)
+    return x_grid, y_grid
+class TorchSpecialFunctions:
+    """Special functions via torch.special with fallbacks for missing functions."""
+    def __init__(self, device: str | None = None, *, use_lut: bool = True):
+        import torch
+        self._torch = torch
+        self._device = device or _get_torch_device()
+        self.use_lut = use_lut
+    def _as_tensor(self, x):
+        return self._torch.as_tensor(x, dtype=self._torch.float64, device=self._device)
+    # ── betainc fallback ───────────────────────────────────────────
+    def betainc(self, a, b, x):
+        t = self._torch
+        # Check if torch has native betainc (>= 1.8)
+        if hasattr(t.special, "betainc"):
+            return t.special.betainc(
+                self._as_tensor(a), self._as_tensor(b), self._as_tensor(x),
+            )
+        # LUT-based betainc for scalar a, b (major speedup for binom)
+        try:
+            af, bf = float(a), float(b)
+        except (TypeError, ValueError):
+            pass  # fall through to element-wise loop below
+        else:
+            if self.use_lut:
+                try:
+                    xg, yg = _get_torch_betainc_lut(af, bf, self._device)
+                    xt = self._as_tensor(x)
+                    xt_clamp = t.clamp(xt, 0.0, 1.0)
+                    idx = t.searchsorted(xg, xt_clamp).clip(1, len(xg) - 1)
+                    x0, x1 = xg[idx - 1], xg[idx]
+                    y0, y1 = yg[idx - 1], yg[idx]
+                    w = (xt_clamp - x0) / (x1 - x0 + 1e-300)
+                    return (y0 + w * (y1 - y0)).clamp(0.0, 1.0).view_as(xt)
+                except Exception:
+                    pass  # fall through to integral fallback
+            return self._betainc_integral(af, bf, self._as_tensor(x))
+        # Non-scalar a or b — grouped LUT lookup (avoids element-wise Chebyshev integral)
+        try:
+            return self._betainc_batch(a, b, x)
+        except Exception:
+            # Full fallback: compute on CPU via scipy
+            try:
+                import scipy.special as _scsp
+                a_np = np.asarray(self._as_tensor(a).cpu().numpy())
+                b_np = np.asarray(self._as_tensor(b).cpu().numpy())
+                x_np = np.asarray(self._as_tensor(x).cpu().numpy())
+                result = _scsp.betainc(
+                    np.clip(a_np, 1, None).astype(int),
+                    np.clip(b_np, 1, None).astype(int),
+                    np.clip(x_np, 0.0, 1.0),
+                )
+                return self._as_tensor(result)
+            except Exception:
+                return self._betainc_integral(1, 1, self._as_tensor(x))
+    def _betainc_integral(self, a, b, x):
+        """Regularized incomplete beta via trapezoidal rule on Chebyshev-mapped grid.
+        Uses Chebyshev-node mapping to cluster grid points near s=0 and s=1.
+        """
+        import math as _math
+        t = self._torch
+        device = x.device
+        x = t.clamp(x, 0.0, 1.0)
+        af, bf = float(a), float(b)
+        if af < 1.0 or bf < 1.0:
+            n_grid = 64000
+        elif af < 5.0 or bf < 5.0:
+            n_grid = 16000
+        else:
+            n_grid = 8000
+        theta = t.linspace(0, _math.pi, n_grid, device=device, dtype=t.float64)
+        s = 0.5 * (1.0 + t.cos(theta))  # descending [≈1, 0]
+        s = s.flip(0)  # ascending [0, ≈1]
+        eps = 1e-14
+        log_val = (a - 1) * t.log(s + 1e-300) + (b - 1) * t.log1p(-s + 1e-300)
+        log_val = t.where(t.isfinite(log_val), log_val, t.tensor(-700.0, dtype=t.float64, device=device))
+        f = t.exp(log_val)
+        beta_ab = _math.exp(_math.lgamma(af) + _math.lgamma(bf) - _math.lgamma(af + bf))
+        ds = s[1:] - s[:-1]
+        cum = t.zeros(n_grid, device=device, dtype=t.float64)
+        cum[1:] = t.cumsum((f[:-1] + f[1:]) * 0.5 * ds, dim=0)
+        x_flat = x.flatten()
+        idx = t.searchsorted(s, x_flat, right=True).clamp(1, n_grid - 1)
+        frac = (x_flat - s[idx - 1]) / (s[idx] - s[idx - 1] + 1e-300)
+        frac = frac.clamp(0.0, 1.0)
+        result = cum[idx - 1] + frac * (cum[idx] - cum[idx - 1])
+        result = result / beta_ab
+        result = t.clamp(result, 0.0, 1.0)
+        result = t.where(x_flat <= eps, 0.0, result)
+        result = t.where(x_flat >= 1 - eps, 1.0, result)
+        return result.view_as(x)
+    def _betainc_batch(self, a, b, x):
+        """Batch betainc for non-scalar a, b via fused 2D-LUT interpolation.
+        All LUTs share the same x-grid (fixed log-spaced scheme), so we:
+        1. Build a 2D y-grid of shape (n_pairs, n_grid) for all unique (a,b) pairs
+        2. Call searchsorted ONCE to find the bracket index for all elements
+        3. Interpolate all pairs simultaneously via batched gather
+        4. Scatter results back to output positions
+        This avoids 100+ separate searchsorted calls, reducing overhead by ~100x.
+        """
+        x_flat = self._as_tensor(x).flatten()
+        a_flat = self._as_tensor(a).flatten()
+        b_flat = self._as_tensor(b).flatten()
+        t = self._torch
+        # Clamp to >= 1 for key encoding (edge cases get overwritten by caller)
+        ai = t.clamp(t.round(a_flat).long(), 1, 100000)
+        bi = t.clamp(t.round(b_flat).long(), 1, 100000)
+        # Encode as single key for unique computation
+        keys = ai * 100000 + bi
+        unique_keys, inverse_idx = t.unique(keys, return_inverse=True)
+        n_pairs = unique_keys.numel()
+        n_elem = x_flat.numel()
+        # Build 2D grid: (n_pairs, n_grid)
+        # All LUTs share the same x-grid, so we only need one
+        y_grid = t.zeros((n_pairs, 40000), dtype=t.float64, device=self._device)
+        xg = None
+        n_actual = 0
+        failed_pairs = []
+        for pi in range(n_pairs):
+            k_val = unique_keys[pi].item()
+            a_val = k_val // 100000
+            b_val = k_val - a_val * 100000
+            try:
+                xg_i, yg_i = _get_torch_betainc_lut(a_val, b_val, self._device)
+                if xg is None:
+                    xg = xg_i  # all LUTs share the same x-grid
+                    n_actual = len(xg_i)
+                y_grid[pi, :len(yg_i)] = yg_i
+            except Exception:
+                failed_pairs.append((pi, float(a_val), float(b_val)))
+        if xg is None:
+            # All LUTs failed, fall back
+            return self._betainc_integral(1, 1, x_flat)
+        xg = xg[:n_actual]
+        y_grid = y_grid[:, :n_actual]
+        # Single searchsorted for all elements
+        x_clamp = t.clamp(x_flat, 0.0, 1.0)
+        sidx = t.searchsorted(xg, x_clamp).clip(1, n_actual - 1)
+        # Interpolation weights (same for all pairs)
+        x0g, x1g = xg[sidx - 1], xg[sidx]
+        w = (x_clamp - x0g) / (x1g - x0g + 1e-300)
+        # Gather y0/y1 for all pairs simultaneously: (n_pairs, n_elem)
+        y0_all = y_grid[:, sidx - 1]  # (n_pairs, n_elem)
+        y1_all = y_grid[:, sidx]      # (n_pairs, n_elem)
+        y_all = y0_all + w.unsqueeze(0) * (y1_all - y0_all)  # (n_pairs, n_elem)
+        y_all = y_all.clamp(0.0, 1.0)
+        # Scatter: select the right pair index for each element
+        # inverse_idx: (n_elem,) → indices into pair dimension
+        # y_all: (n_pairs, n_elem) → gather along dim=0
+        result = y_all[inverse_idx, t.arange(n_elem, device=self._device)]
+        if failed_pairs:
+            for pi, a_val, b_val in failed_pairs:
+                mask = inverse_idx == pi
+                if t.any(mask):
+                    result[mask] = self._betainc_integral(a_val, b_val, x_clamp[mask])
+        return result.view(self._as_tensor(a).shape)
+    def betaincinv(self, a, b, y):
+        t = self._torch
+        af, bf = float(a), float(b)
+        yt = self._as_tensor(y)
+        if hasattr(t.special, "betaincinv"):
+            return t.special.betaincinv(
+                self._as_tensor(a), self._as_tensor(b), yt,
+            )
+        # For scalar a, b: LUT lookup + 1-step Newton refine
+        if not self.use_lut:
+            return self._betaincinv_newton(af, bf, yt)
+        try:
+            y_grid, x_grid = _get_torch_betaincinv_lut(af, bf, yt.device)
+            # Searchsorted to find bracket index
+            idx = t.searchsorted(y_grid, t.clamp(yt, 0.0, 1.0)).clamp(1, len(y_grid) - 1)
+            # Interpolate between two nearest LUT points
+            y0, y1 = y_grid[idx - 1], y_grid[idx]
+            x0, x1 = x_grid[idx - 1], x_grid[idx]
+            w = (yt - y0) / (y1 - y0 + 1e-300)
+            x = x0 + w * (x1 - x0)
+            x = t.clamp(x, 1e-10, 1.0 - 1e-10)
+            # 1-step Newton refine
+            import math as _math
+            beta_ab = math.exp(math.lgamma(af) + math.lgamma(bf) - math.lgamma(af + bf))
+            val = self._betainc_integral(af, bf, x)
+            deriv = t.pow(t.clamp(x, 1e-300, 1 - 1e-300), af - 1) * \
+                    t.pow(t.clamp(1 - x, 1e-300, 1 - 1e-300), bf - 1) / beta_ab
+            deriv = t.clamp(deriv, 1e-300, 1e300)
+            step = (val - yt) / deriv
+            x = x - step
+            x = t.clamp(x, 1e-10, 1.0 - 1e-10)
+            return x
+        except Exception:
+            return self._betaincinv_newton(af, bf, yt)
+    def _betaincinv_newton(self, a, b, y):
+        """Inverse regularized incomplete beta via damped Newton-Raphson."""
+        t = self._torch
+        device = y.device
+        y = t.clamp(y, 1e-15, 1 - 1e-15)
+        import math as _math
+        # Logit-normal approximation for initial guess
+        import scipy.special as _scsp
+        mu = _scsp.digamma(a) - _scsp.digamma(b)
+        sigma2 = 1.0 / a + 1.0 / b
+        sigma = math.sqrt(sigma2)
+        z = -_math.sqrt(2.0) * self.erfcinv(2.0 * y)
+        z = self._as_tensor(z) if not isinstance(z, t.Tensor) else z
+        logit_q = mu + sigma * z
+        x = 1.0 / (1.0 + t.exp(-logit_q))
+        x = t.clamp(x, 1e-10, 1.0 - 1e-10)
+        # Damped Newton refinement
+        beta_ab = math.exp(math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b))
+        for _ in range(50):
+            val = self._betainc_integral(a, b, x)
+            diff = val - y
+            if t.max(t.abs(diff)) < 1e-13:
+                break
+            deriv = t.pow(t.clamp(x, 1e-300, 1 - 1e-300), a - 1) * \
+                    t.pow(t.clamp(1 - x, 1e-300, 1 - 1e-300), b - 1) / beta_ab
+            deriv = t.clamp(deriv, 1e-300, 1e300)
+            step = diff / deriv
+            # Damped: backtracking to keep x in valid range
+            for _ in range(20):
+                x_new = x - step
+                if t.min(x_new) < 1e-15 or t.max(x_new) > 1.0 - 1e-15:
+                    step = step * 0.5
+                else:
+                    break
+            x = x - step
+            x = t.clamp(x, 1e-10, 1.0 - 1e-10)
+        return x
+    def gammainc(self, a, x):
+        return self._torch.special.gammainc(self._as_tensor(a), self._as_tensor(x))
+    def gammaincc(self, a, x):
+        return self._torch.special.gammaincc(self._as_tensor(a), self._as_tensor(x))
+    def gammaincinv(self, a, q):
+        t = self._torch
+        af = float(a)
+        qt = self._as_tensor(q)
+        if hasattr(t.special, "gammaincinv"):
+            return t.special.gammaincinv(self._as_tensor(a), qt)
+        return self._gammaincinv_newton(af, qt)
+    def _gammaincinv_newton(self, a, q):
+        """Inverse regularized lower incomplete gamma via damped Newton-Raphson."""
+        t = self._torch
+        device = q.device
+        q = t.clamp(q, 1e-15, 1 - 1e-15)
+        import math
+        at = t.tensor(a, dtype=t.float64, device=device)
+        # Wilson-Hilferty initial guess (much better than erfinv-based)
+        # For gamma(a,1): P(a,x) ≈ Φ((x/a)^(1/3) - (1 - 1/(9a))) / sqrt(1/(9a))
+        z = math.sqrt(2.0) * t.erfinv(2.0 * q - 1.0)
+        c = 1.0 - 1.0 / (9.0 * a)
+        s = 1.0 / math.sqrt(9.0 * a)
+        u = z * s + c
+        x = a * t.pow(u, 3.0)
+        x = t.clamp(x, 1e-10, 1e6)
+        lg_a = math.lgamma(a)
+        for _ in range(50):
+            val = t.special.gammainc(at, x)
+            diff = val - q
+            if t.max(t.abs(diff)) < 1e-13:
+                break
+            # derivative: d/dx P(a,x) = x^(a-1) * e^(-x) / Gamma(a)
+            log_deriv = (a - 1.0) * t.log(t.clamp(x, 1e-300, None)) - x - lg_a
+            deriv = t.exp(log_deriv)
+            deriv = t.clamp(deriv, 1e-300, 1e300)
+            step = diff / deriv
+            # Damped: backtracking line search to prevent oscillation
+            # Accept full step if it stays in bounds; otherwise halve
+            damped = False
+            for _ in range(20):
+                x_new = x - step
+                if t.min(x_new) < 1e-15 or t.max(x_new) > 2e6:
+                    step = step * 0.5
+                    damped = True
+                else:
+                    break
+            x = x - step
+            x = t.clamp(x, 1e-10, 1e6)
+        return x
+    def gammaln(self, x):
+        return self._torch.lgamma(self._as_tensor(x))
+    def erf(self, x):
+        return self._torch.erf(self._as_tensor(x))
+    def erfc(self, x):
+        return self._torch.erfc(self._as_tensor(x))
+    def erfcinv(self, y):
+        t = self._torch
+        yt = self._as_tensor(y)
+        if hasattr(t.special, "erfcinv"):
+            return t.special.erfcinv(yt)
+        # Fallback: erfcinv(y) = erfinv(1 - y)
+        return t.erfinv(1.0 - yt)
+    def sqrt(self, x):
+        return self._torch.sqrt(self._as_tensor(x))
+    @property
+    def pi(self):
+        return self._torch.tensor(math.pi, dtype=self._torch.float64, device=self._device)
+    def clip(self, x, lo, hi):
+        return self._torch.clamp(x, lo, hi)
+    def where(self, cond, x, y):
+        t = self._torch
+        # torch.where requires boolean condition tensor
+        if isinstance(cond, t.Tensor) and cond.dtype != t.bool:
+            cond = cond.to(dtype=t.bool)
+        return t.where(cond, x, y)
+    def as_float64(self, x):
+        return self._as_tensor(x)
+# =============================================================================
+# SciPy / NumPy backend
+# =============================================================================
+class ScipySpecialFunctions:
+    """Special functions via scipy.special (pure NumPy / CPU).
+    Inverse functions (gammaincinv, betaincinv) use cached LUT + interpolation
+    for ~100ms evaluation on 1M points (vs ~3000ms for scipy's iterative solver).
+    Accuracy: ~1e-5 for typical parameter ranges.
+    For edge-case parameters (extreme a, b), falls back to scipy for full accuracy.
+    """
+    def __init__(self, *, use_lut: bool = True):
+        import scipy.special as scsp
+        self._scsp = scsp
+        self.use_lut = use_lut
+        # LUT cache for inverse functions (scalar a/b cases, well-behaved parameters)
+        self._gammaincinv_lut = {}
+        self._betaincinv_lut = {}
+    @staticmethod
+    @lru_cache(maxsize=256)
+    def _make_gammaincinv_lut(a, n_grid):
+        """Build LUT: x_grid -> y = gammainc(a, x_grid)."""
+        import scipy.special as _scsp
+        x_max = a + 20 * math.sqrt(max(a, 0.1)) + 10
+        x_max = min(x_max, 1e6)
+        n_log = n_grid // 3
+        n_lin = n_grid - n_log
+        x_lo = np.logspace(-15, math.log10(max(x_max, 1e-10)), n_log, endpoint=False)
+        x_hi = np.linspace(x_lo[-1] if len(x_lo) > 0 else 0, x_max, n_lin + 1)[1:]
+        x_grid = np.concatenate([x_lo, x_hi])
+        if len(x_grid) < n_grid:
+            extra = np.linspace(x_grid[-1], x_max, n_grid - len(x_grid) + 2)[1:]
+            x_grid = np.concatenate([x_grid, extra])
+        x_grid = x_grid[:n_grid]
+        y_grid = _scsp.gammainc(a, x_grid)
+        y_grid[0] = 0.0
+        y_grid[-1] = 1.0
+        return x_grid, y_grid
+    @staticmethod
+    @lru_cache(maxsize=256)
+    def _make_betaincinv_lut(a, b, n_grid):
+        """Build LUT: x_grid -> y = betainc(a, b, x_grid).
+        Uses log spacing near both boundaries for better precision when
+        a or b is small (e.g. b=0.5 for t/f distributions).
+        """
+        import scipy.special as _scsp
+        eps = 1e-15
+        # Log spacing: 40% near each boundary, 20% in the middle
+        n_edge = int(n_grid * 0.4)
+        n_mid = n_grid - 2 * n_edge
+        x_lo = np.logspace(np.log10(eps), np.log10(0.01), n_edge)
+        x_mid = np.linspace(0.01, 0.99, n_mid + 2)[1:-1]
+        x_hi = 1.0 - np.logspace(np.log10(eps), np.log10(0.01), n_edge)[::-1]
+        x_grid = np.concatenate([x_lo, x_mid, x_hi])
+        if len(x_grid) > n_grid:
+            x_grid = x_grid[:n_grid]
+        y_grid = _scsp.betainc(a, b, x_grid)
+        return x_grid, y_grid
+    @staticmethod
+    def _inverse_lut(q_or_y, x_grid, y_grid):
+        """Use LUT for inverse: given q, find x such that f(x) = q."""
+        idx = np.searchsorted(y_grid, q_or_y, side='left').clip(1, len(y_grid) - 1)
+        frac = (q_or_y - y_grid[idx - 1]) / (y_grid[idx] - y_grid[idx - 1] + 1e-300)
+        frac = np.clip(frac, 0.0, 1.0)
+        return x_grid[idx - 1] + frac * (x_grid[idx] - x_grid[idx - 1])
+    def betainc(self, a, b, x):
+        return self._scsp.betainc(a, b, np.asarray(x, dtype=np.float64))
+    def betaincinv(self, a, b, y):
+        arr = np.asarray(y, dtype=np.float64)
+        try:
+            af, bf = float(a), float(b)
+        except (TypeError, ValueError):
+            return self._scsp.betaincinv(a, b, arr)
+        if not self.use_lut:
+            return self._scsp.betaincinv(af, bf, arr)
+        if af < 0.3 or bf < 0.3 or af > 50 or bf > 50 or abs(af - bf) > 30:
+            return self._scsp.betaincinv(af, bf, arr)
+        # LUT + 1-step Newton refinement
+        key = (af, bf)
+        if key not in self._betaincinv_lut:
+            x_grid, y_grid = self._make_betaincinv_lut(af, bf, 20000)
+            self._betaincinv_lut[key] = (x_grid, y_grid)
+        x_grid, y_grid = self._betaincinv_lut[key]
+        x0 = self._inverse_lut(arr, x_grid, y_grid)
+        # 1 step of Newton
+        log_beta = math.lgamma(af) + math.lgamma(bf) - math.lgamma(af + bf)
+        p = self._scsp.betainc(af, bf, x0)
+        diff = p - arr
+        log_deriv = (af - 1.0) * np.log(np.clip(x0, 1e-300, None)) + \
+                    (bf - 1.0) * np.log(np.clip(1.0 - x0, 1e-300, None)) - log_beta
+        deriv = np.exp(log_deriv)
+        x1 = x0 - diff / np.clip(deriv, 1e-300, 1e300)
+        return np.clip(x1, 1e-15, 1.0 - 1e-15)
+    def gammainc(self, a, x):
+        return self._scsp.gammainc(np.asarray(a, dtype=np.float64),
+                                    np.asarray(x, dtype=np.float64))
+    def gammaincc(self, a, x):
+        return self._scsp.gammaincc(np.asarray(a, dtype=np.float64),
+                                     np.asarray(x, dtype=np.float64))
+    def gammaincinv(self, a, q):
+        arr = np.asarray(q, dtype=np.float64)
+        try:
+            af = float(a)
+        except (TypeError, ValueError):
+            return self._scsp.gammaincinv(a, arr)
+        if not self.use_lut:
+            return self._scsp.gammaincinv(af, arr)
+        if af < 1.0:
+            return self._scsp.gammaincinv(af, arr)
+        # LUT + 1-step Newton refinement
+        key = (af,)
+        if key not in self._gammaincinv_lut:
+            x_grid, y_grid = self._make_gammaincinv_lut(af, 20000)
+            self._gammaincinv_lut[key] = (x_grid, y_grid)
+        x_grid, y_grid = self._gammaincinv_lut[key]
+        x0 = self._inverse_lut(arr, x_grid, y_grid)
+        # 1 step of Newton: x = x0 - (P(a, x0) - q) / P'(a, x0)
+        log_ga = math.lgamma(af)
+        p = self._scsp.gammainc(af, x0)
+        diff = p - arr
+        log_deriv = (af - 1.0) * np.log(np.clip(x0, 1e-300, None)) - x0 - log_ga
+        deriv = np.exp(log_deriv)
+        x1 = x0 - diff / np.clip(deriv, 1e-300, 1e300)
+        return np.clip(x1, 1e-15, 1e6)
+    def gammaln(self, x):
+        return self._scsp.gammaln(np.asarray(x, dtype=np.float64))
+    def erf(self, x):
+        return self._scsp.erf(np.asarray(x, dtype=np.float64))
+    def erfc(self, x):
+        return self._scsp.erfc(np.asarray(x, dtype=np.float64))
+    def erfcinv(self, y):
+        return self._scsp.erfcinv(np.asarray(y, dtype=np.float64))
+    def sqrt(self, x):
+        return np.sqrt(np.asarray(x, dtype=np.float64))
+    @property
+    def pi(self):
+        return np.pi
+    def clip(self, x, lo, hi):
+        return np.clip(x, lo, hi)
+    def where(self, cond, x, y):
+        return np.where(cond, x, y)
+    def as_float64(self, x):
+        return np.asarray(x, dtype=np.float64)
+# =============================================================================
+# Distribution base classes — parameterized by SpecialFunctions
+# =============================================================================
+_T_PPF_BISECT_LOWER = -64.0
+_T_PPF_BISECT_UPPER = 64.0
+class NormDistributionBase:
+    """scipy.stats.norm-like distribution, parameterized by SpecialFunctions."""
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def _cdf_standard(self, x):
+        sf = self._sf
+        return 0.5 * (1.0 + sf.erf(x / sf.sqrt(2.0)))
+    def _sf_standard(self, x):
+        return sf_safe_mul(self._sf.erfc(x / self._sf.sqrt(2.0)), 0.5, self._sf)
+    def _ppf_standard(self, q):
+        return -self._sf.sqrt(2.0) * self._sf.erfcinv(2.0 * q)
+    def _isf_standard(self, q):
+        return self._ppf_standard(1.0 - q)
+    def _two_sided_pvalue_standard(self, stat_abs):
+        sf = self._sf
+        return sf.clip(2.0 * self._sf_standard(sf.as_float64(stat_abs)), 0.0, 1.0)
+    def _two_sided_critical_value_standard(self, alpha):
+        sf = self._sf
+        a = float(alpha)
+        if not (0.0 < a < 1.0):
+            return sf.as_float64(float("nan"))
+        return self._ppf_standard(1.0 - a / 2.0)
+    def cdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(sf.as_float64(x) * 0 + 1, float("nan"), float("nan"))
+        x_std = (sf.as_float64(x) - float(loc)) / scale_f
+        return self._cdf_standard(x_std)
+    def sf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(sf.as_float64(x) * 0 + 1, float("nan"), float("nan"))
+        x_std = (sf.as_float64(x) - float(loc)) / scale_f
+        return self._sf_standard(x_std)
+    def ppf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        return float(loc) + scale_f * self._ppf_standard(q_f)
+    def isf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        return float(loc) + scale_f * self._isf_standard(q_f)
+    def pdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        norm_const = sf.sqrt(2.0 * sf.pi)
+        return sf.exp(-0.5 * sf.square(z)) / (scale_f * norm_const)
+    def two_sided_pvalue(self, stat_abs):
+        return self._two_sided_pvalue_standard(stat_abs)
+    def two_sided_critical_value(self, alpha):
+        return self._two_sided_critical_value_standard(alpha)
+    def rvs(self, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_normal(self._sf, size=size, loc=loc, scale=scale)
+class TDistributionBase:
+    """scipy.stats.t-like distribution, parameterized by SpecialFunctions."""
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def _cdf_standard(self, x, df):
+        sf = self._sf
+        df_val = float(df)
+        if df_val <= 0:
+            return sf.where(x * 0 + 1, float("nan"), float("nan"))
+        z = df_val / (df_val + sf.square(sf.abs(x)))
+        ibeta = sf.betainc(df_val / 2.0, 0.5, z)
+        lower_tail = 0.5 * ibeta
+        return sf.where(x >= 0.0, 1.0 - lower_tail, lower_tail)
+    def _sf_standard(self, x, df):
+        return sf_safe_sub(1.0, self._cdf_standard(x, df), self._sf)
+    def _two_sided_pvalue_standard(self, stat_abs, df):
+        sf = self._sf
+        df_val = float(df)
+        if df_val <= 0:
+            return sf.where(stat_abs * 0 + 1, float("nan"), float("nan"))
+        z = df_val / (df_val + sf.square(sf.abs(stat_abs)))
+        return sf.betainc(df_val / 2.0, 0.5, z)
+    def _ppf_standard(self, q, df, *, max_bisect_steps=60):
+        sf = self._sf
+        df_val = float(df)
+        if df_val <= 0:
+            return sf.where(sf.as_float64(q) * 0 + 1, float("nan"), float("nan"))
+        q_f = sf.as_float64(q)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        out = sf.where(q_f == 0.0, -float("inf"), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        if not bool(sf.any(valid)):
+            return out
+        try:
+            tail = sf.minimum(q_f, 1.0 - q_f)
+            y = 2.0 * tail
+            y_inv = sf.betaincinv(df_val / 2.0, 0.5, y)
+            x_pos = sf.sqrt(df_val * (1.0 - y_inv) / y_inv)
+            quant = sf.where(q_f >= 0.5, x_pos, -x_pos)
+            return sf.where(valid, quant, out)
+        except Exception:
+            return self._ppf_bisect(q_f, df_val, valid, out, max_bisect_steps)
+    def _ppf_bisect(self, q, df_val, valid, out, steps):
+        sf = self._sf
+        lo = sf.where(q * 0 + 1, _T_PPF_BISECT_LOWER, _T_PPF_BISECT_LOWER)
+        hi = sf.where(q * 0 + 1, _T_PPF_BISECT_UPPER, _T_PPF_BISECT_UPPER)
+        for _ in range(max(int(steps), 1)):
+            mid = 0.5 * (lo + hi)
+            cdf_mid = self._cdf_standard(mid, df_val)
+            go_right = cdf_mid < q
+            lo = sf.where(go_right, mid, lo)
+            hi = sf.where(go_right, hi, mid)
+        quant = 0.5 * (lo + hi)
+        return sf.where(valid, quant, out)
+    def cdf(self, x, df, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(sf.as_float64(x) * 0 + 1, float("nan"), float("nan"))
+        x_std = (sf.as_float64(x) - float(loc)) / scale_f
+        return self._cdf_standard(x_std, df)
+    def sf(self, x, df, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(sf.as_float64(x) * 0 + 1, float("nan"), float("nan"))
+        x_std = (sf.as_float64(x) - float(loc)) / scale_f
+        return self._sf_standard(x_std, df)
+    def ppf(self, q, df, *, loc=0.0, scale=1.0, max_bisect_steps=60):
+        sf = self._sf
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(sf.as_float64(q) * 0 + 1, float("nan"), float("nan"))
+        return float(loc) + scale_f * self._ppf_standard(q, df, max_bisect_steps=max_bisect_steps)
+    def isf(self, q, df, *, loc=0.0, scale=1.0, max_bisect_steps=60):
+        sf = self._sf
+        scale_f = float(scale)
+        if scale_f <= 0:
+            return sf.where(sf.as_float64(q) * 0 + 1, float("nan"), float("nan"))
+        return float(loc) + scale_f * self._ppf_standard(1.0 - sf.as_float64(q), df, max_bisect_steps=max_bisect_steps)
+    def pdf(self, x, df, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        df_val = float(df)
+        scale_f = float(scale)
+        if df_val <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        half_nu = df_val / 2.0
+        log_coef = (
+            sf.gammaln((df_val + 1.0) / 2.0)
+            - sf.gammaln(half_nu)
+            - 0.5 * (sf.log(df_val) + sf.log(sf.pi))
+        )
+        log_pdf = (
+            log_coef
+            - ((df_val + 1.0) / 2.0) * sf.log1p(sf.square(z) / df_val)
+            - sf.log(scale_f)
+        )
+        return sf.exp(log_pdf)
+    def two_sided_pvalue(self, stat_abs, df):
+        return self._two_sided_pvalue_standard(stat_abs, df)
+    def two_sided_critical_value(self, alpha, df, *, max_bisect_steps=60):
+        sf = self._sf
+        a = float(alpha)
+        if not (0.0 < a < 1.0):
+            return sf.as_float64(float("nan"))
+        return self._ppf_standard(1.0 - a / 2.0, df, max_bisect_steps=max_bisect_steps)
+    def rvs(self, df, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_t(self._sf, df=df, size=size, loc=loc, scale=scale)
+class UniformDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return sf.clip(z, 0.0, 1.0)
+    def sf(self, x, *, loc=0.0, scale=1.0):
+        return sf_safe_sub(1.0, self.cdf(x, loc=loc, scale=scale), self._sf)
+    def ppf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        q_f = sf.as_float64(q)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if scale_f <= 0.0:
+            return out
+        valid = (q_f >= 0.0) & (q_f <= 1.0)
+        return sf.where(valid, float(loc) + scale_f * q_f, out)
+    def isf(self, q, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), loc=loc, scale=scale)
+    def pdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        in_support = (z >= 0.0) & (z <= 1.0)
+        return sf.where(in_support, 1.0 / scale_f, 0.0)
+    def rvs(self, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_uniform(self._sf, size=size, loc=loc, scale=scale)
+class ExponDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        scale_f = float(scale)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return sf.where(z <= 0.0, 0.0, 1.0 - sf.exp(-z))
+    def sf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        scale_f = float(scale)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return sf.where(z <= 0.0, 1.0, sf.exp(-z))
+    def ppf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        q_f = sf.as_float64(q)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, float(loc), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) - scale_f * sf.log1p(-q_f), out)
+    def isf(self, q, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), loc=loc, scale=scale)
+    def pdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        scale_f = float(scale)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return sf.where(z >= 0.0, sf.exp(-z) / scale_f, 0.0)
+    def rvs(self, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_expon(self._sf, size=size, loc=loc, scale=scale)
+class CauchyDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return 0.5 + sf.atan(z) / sf.pi
+    def sf(self, x, *, loc=0.0, scale=1.0):
+        return sf_safe_sub(1.0, self.cdf(x, loc=loc, scale=scale), self._sf)
+    def ppf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        q_f = sf.as_float64(q)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, -float("inf"), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) + scale_f * sf.tan(sf.pi * (q_f - 0.5)), out)
+    def isf(self, q, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), loc=loc, scale=scale)
+    def pdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return 1.0 / (sf.pi * scale_f * (1.0 + sf.square(z)))
+    def rvs(self, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_cauchy(self._sf, size=size, loc=loc, scale=scale)
+class LaplaceDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return sf.where(z < 0.0, 0.5 * sf.exp(z), 1.0 - 0.5 * sf.exp(-z))
+    def sf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return sf.where(z < 0.0, 1.0 - 0.5 * sf.exp(z), 0.5 * sf.exp(-z))
+    def ppf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        q_f = sf.as_float64(q)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, -float("inf"), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        lower = (q_f > 0.0) & (q_f < 0.5)
+        upper = (q_f >= 0.5) & (q_f < 1.0)
+        out = sf.where(lower, float(loc) + scale_f * sf.log(2.0 * q_f), out)
+        out = sf.where(upper, float(loc) - scale_f * sf.log(2.0 * (1.0 - q_f)), out)
+        return out
+    def isf(self, q, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), loc=loc, scale=scale)
+    def pdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = sf.abs((x_f - float(loc)) / scale_f)
+        return 0.5 * sf.exp(-z) / scale_f
+    def rvs(self, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_laplace(self._sf, size=size, loc=loc, scale=scale)
+class LogisticDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return 1.0 / (1.0 + sf.exp(-z))
+    def sf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        x_f = sf.as_float64(x)
+        if scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (x_f - float(loc)) / scale_f
+        return 1.0 / (1.0 + sf.exp(z))
+    def ppf(self, q, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        scale_f = float(scale)
+        q_f = sf.as_float64(q)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, -float("inf"), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) + scale_f * sf.log(q_f / (1.0 - q_f)), out)
+    def isf(self, q, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), loc=loc, scale=scale)
+    def pdf(self, x, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        cdf_x = self.cdf(x, loc=loc, scale=scale)
+        scale_f = float(scale)
+        if scale_f <= 0.0:
+            return cdf_x
+        return cdf_x * (1.0 - cdf_x) / scale_f
+    def rvs(self, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_logistic(self._sf, size=size, loc=loc, scale=scale)
+class Chi2DistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, df):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        df_f = float(df)
+        if df_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = x_f / 2.0
+        return sf.where(x_f <= 0.0, 0.0, sf.gammainc(df_f / 2.0, y))
+    def sf(self, x, df):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        df_f = float(df)
+        if df_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = x_f / 2.0
+        return sf.where(x_f <= 0.0, 1.0, sf.gammaincc(df_f / 2.0, y))
+    def ppf(self, q, df):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        df_f = float(df)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if df_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, 0.0, out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, 2.0 * sf.gammaincinv(df_f / 2.0, q_f), out)
+    def isf(self, q, df):
+        return self.ppf(1.0 - self._sf.as_float64(q), df)
+    def pdf(self, x, df):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        df_f = float(df)
+        if df_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = sf.maximum(x_f, 1e-300)
+        logpdf = ((df_f / 2.0) - 1.0) * sf.log(y) - y / 2.0 - (df_f / 2.0) * sf.log(2.0) - sf.gammaln(df_f / 2.0)
+        return sf.where(x_f > 0.0, sf.exp(logpdf), 0.0)
+    def rvs(self, df, *, size=None, dtype=None):
+        return _rvs_chi2(self._sf, df=df, size=size)
+class GammaDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, a, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        a_f = float(a)
+        scale_f = float(scale)
+        if a_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        return sf.where(y <= 0.0, 0.0, sf.gammainc(a_f, y))
+    def sf(self, x, a, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        a_f = float(a)
+        scale_f = float(scale)
+        if a_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        return sf.where(y <= 0.0, 1.0, sf.gammaincc(a_f, y))
+    def ppf(self, q, a, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        a_f = float(a)
+        scale_f = float(scale)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if a_f <= 0.0 or scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, float(loc), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) + scale_f * sf.gammaincinv(a_f, q_f), out)
+    def isf(self, q, a, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), a, loc=loc, scale=scale)
+    def pdf(self, x, a, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        a_f = float(a)
+        scale_f = float(scale)
+        if a_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        y_safe = sf.maximum(y, 1e-300)
+        logpdf = (a_f - 1.0) * sf.log(y_safe) - y_safe - sf.gammaln(a_f) - sf.log(scale_f)
+        return sf.where(y > 0.0, sf.exp(logpdf), 0.0)
+    def rvs(self, a, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_gamma(self._sf, a=a, size=size, loc=loc, scale=scale)
+class BetaDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, a, b, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        a_f = float(a)
+        b_f = float(b)
+        scale_f = float(scale)
+        if a_f <= 0.0 or b_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        core = sf.betainc(a_f, b_f, sf.clip(y, 0.0, 1.0))
+        out = sf.where(y <= 0.0, 0.0, core)
+        return sf.where(y >= 1.0, 1.0, out)
+    def sf(self, x, a, b, *, loc=0.0, scale=1.0):
+        return sf_safe_sub(1.0, self.cdf(x, a, b, loc=loc, scale=scale), self._sf)
+    def ppf(self, q, a, b, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        a_f = float(a)
+        b_f = float(b)
+        scale_f = float(scale)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if a_f <= 0.0 or b_f <= 0.0 or scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, float(loc), out)
+        out = sf.where(q_f == 1.0, float(loc) + scale_f, out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) + scale_f * sf.betaincinv(a_f, b_f, q_f), out)
+    def isf(self, q, a, b, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), a, b, loc=loc, scale=scale)
+    def pdf(self, x, a, b, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        a_f = float(a)
+        b_f = float(b)
+        scale_f = float(scale)
+        if a_f <= 0.0 or b_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        y_safe = sf.clip(y, 1e-300, 1.0 - 1e-300)
+        betaln = sf.gammaln(a_f) + sf.gammaln(b_f) - sf.gammaln(a_f + b_f)
+        logpdf = (a_f - 1.0) * sf.log(y_safe) + (b_f - 1.0) * sf.log1p(-y_safe) - betaln - sf.log(scale_f)
+        in_support = (y > 0.0) & (y < 1.0)
+        return sf.where(in_support, sf.exp(logpdf), 0.0)
+    def rvs(self, a, b, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_beta(self._sf, a=a, b=b, size=size, loc=loc, scale=scale)
+class FDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, dfn, dfd):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        dfn_f = float(dfn)
+        dfd_f = float(dfd)
+        if dfn_f <= 0.0 or dfd_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        z = (dfn_f * sf.maximum(x_f, 0.0)) / (dfn_f * sf.maximum(x_f, 0.0) + dfd_f)
+        core = sf.betainc(dfn_f / 2.0, dfd_f / 2.0, z)
+        return sf.where(x_f <= 0.0, 0.0, core)
+    def sf(self, x, dfn, dfd):
+        return sf_safe_sub(1.0, self.cdf(x, dfn, dfd), self._sf)
+    def ppf(self, q, dfn, dfd):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        dfn_f = float(dfn)
+        dfd_f = float(dfd)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if dfn_f <= 0.0 or dfd_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, 0.0, out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        z = sf.betaincinv(dfn_f / 2.0, dfd_f / 2.0, q_f)
+        return sf.where(valid, (dfd_f * z) / (dfn_f * (1.0 - z)), out)
+    def isf(self, q, dfn, dfd):
+        return self.ppf(1.0 - self._sf.as_float64(q), dfn, dfd)
+    def pdf(self, x, dfn, dfd):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        dfn_f = float(dfn)
+        dfd_f = float(dfd)
+        if dfn_f <= 0.0 or dfd_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        a = dfn_f / 2.0
+        b = dfd_f / 2.0
+        x_safe = sf.maximum(x_f, 1e-300)
+        betaln = sf.gammaln(a) + sf.gammaln(b) - sf.gammaln(a + b)
+        logpdf = a * sf.log(dfn_f / dfd_f) + (a - 1.0) * sf.log(x_safe) - betaln - (a + b) * sf.log1p((dfn_f / dfd_f) * x_safe)
+        return sf.where(x_f > 0.0, sf.exp(logpdf), 0.0)
+    def rvs(self, dfn, dfd, *, size=None, dtype=None):
+        return _rvs_f(self._sf, dfn=dfn, dfd=dfd, size=size)
+class WeibullMinDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def cdf(self, x, c, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        c_f = float(c)
+        scale_f = float(scale)
+        if c_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        yc = sf.power(sf.maximum(y, 0.0), c_f)
+        return sf.where(y <= 0.0, 0.0, 1.0 - sf.exp(-yc))
+    def sf(self, x, c, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        c_f = float(c)
+        scale_f = float(scale)
+        if c_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        yc = sf.power(sf.maximum(y, 0.0), c_f)
+        return sf.where(y <= 0.0, 1.0, sf.exp(-yc))
+    def ppf(self, q, c, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        c_f = float(c)
+        scale_f = float(scale)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if c_f <= 0.0 or scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, float(loc), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) + scale_f * sf.power(-sf.log1p(-q_f), 1.0 / c_f), out)
+    def isf(self, q, c, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), c, loc=loc, scale=scale)
+    def pdf(self, x, c, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        c_f = float(c)
+        scale_f = float(scale)
+        if c_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        y_pos = sf.maximum(y, 1e-300)
+        logpdf = sf.log(c_f / scale_f) + (c_f - 1.0) * sf.log(y_pos) - sf.power(y_pos, c_f)
+        return sf.where(y > 0.0, sf.exp(logpdf), 0.0)
+    def rvs(self, c, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_weibull(self._sf, c=c, size=size, loc=loc, scale=scale)
+class LognormDistributionBase:
+    def __init__(self, sf: SpecialFunctions, norm_dist: NormDistributionBase):
+        self._sf = sf
+        self._norm = norm_dist
+    def cdf(self, x, s, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        s_f = float(s)
+        scale_f = float(scale)
+        if s_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        z = sf.log(sf.maximum(y, 1e-300)) / s_f
+        return sf.where(y <= 0.0, 0.0, self._norm._cdf_standard(z))
+    def sf(self, x, s, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        s_f = float(s)
+        scale_f = float(scale)
+        if s_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        z = sf.log(sf.maximum(y, 1e-300)) / s_f
+        return sf.where(y <= 0.0, 1.0, self._norm._sf_standard(z))
+    def ppf(self, q, s, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        s_f = float(s)
+        scale_f = float(scale)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if s_f <= 0.0 or scale_f <= 0.0:
+            return out
+        out = sf.where(q_f == 0.0, float(loc), out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        return sf.where(valid, float(loc) + scale_f * sf.exp(s_f * self._norm._ppf_standard(q_f)), out)
+    def isf(self, q, s, *, loc=0.0, scale=1.0):
+        return self.ppf(1.0 - self._sf.as_float64(q), s, loc=loc, scale=scale)
+    def pdf(self, x, s, *, loc=0.0, scale=1.0):
+        sf = self._sf
+        x_f = sf.as_float64(x)
+        s_f = float(s)
+        scale_f = float(scale)
+        if s_f <= 0.0 or scale_f <= 0.0:
+            return sf.where(x_f * 0 + 1, float("nan"), float("nan"))
+        y = (x_f - float(loc)) / scale_f
+        y_pos = sf.maximum(y, 1e-300)
+        z = sf.log(y_pos) / s_f
+        logpdf = -0.5 * sf.square(z) - sf.log(y_pos * s_f * sf.sqrt(2.0 * sf.pi)) - sf.log(scale_f)
+        return sf.where(y > 0.0, sf.exp(logpdf), 0.0)
+    def rvs(self, s, *, size=None, loc=0.0, scale=1.0, dtype=None):
+        return _rvs_lognorm(self._sf, s=s, size=size, loc=loc, scale=scale)
+class PoissonDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def _ppf_search(self, q, mu):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        mu_f = float(mu)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if mu_f < 0.0:
+            return out
+        out = sf.where(q_f == 0.0, -1.0, out)
+        out = sf.where(q_f == 1.0, float("inf"), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        if not bool(sf.any(valid)):
+            return out
+        hi0 = float(max(1.0, np.ceil(mu_f + 10.0 * np.sqrt(mu_f + 1.0) + 10.0)))
+        low = sf.where(q_f * 0 + 1, -1.0, -1.0)
+        high = sf.where(q_f * 0 + 1, hi0, hi0)
+        for _ in range(16):
+            cdf_high = sf.where(high < 0.0, 0.0, sf.gammaincc(high + 1.0, mu_f))
+            need_expand = valid & (cdf_high < q_f)
+            high = sf.where(need_expand, sf.maximum(high * 2.0 + 1.0, 1.0), high)
+        max_high_f = float(np.max(sf.to_numpy(sf.where(valid, high, 0.0))))
+        steps = int(np.ceil(np.log2(max(max_high_f + 2.0, 2.0)))) + 2
+        for _ in range(max(1, steps)):
+            mid = sf.floor((low + high) / 2.0)
+            cdf_mid = sf.where(mid < 0.0, 0.0, sf.gammaincc(mid + 1.0, mu_f))
+            move_right = valid & (cdf_mid < q_f)
+            low = sf.where(move_right, mid, low)
+            high = sf.where(valid & (~move_right), mid, high)
+        k = sf.floor(high)
+        cdf_k = sf.where(k < 0.0, 0.0, sf.gammaincc(k + 1.0, mu_f))
+        k = sf.where(valid & (cdf_k < q_f), k + 1.0, k)
+        km1 = k - 1.0
+        cdf_km1 = sf.where(km1 < 0.0, 0.0, sf.gammaincc(k, mu_f))
+        return sf.where(valid & (km1 >= -1.0) & (cdf_km1 >= q_f), km1, sf.where(valid, k, out))
+    def pmf(self, k, mu, *, loc=0):
+        sf = self._sf
+        k_f = sf.as_float64(k) - float(loc)
+        mu_f = float(mu)
+        if mu_f < 0.0:
+            return sf.where(k_f * 0 + 1, float("nan"), float("nan"))
+        k_floor = sf.floor(k_f)
+        is_int = (k_floor == k_f)
+        valid = (k_f >= 0.0) & is_int
+        k_safe = sf.maximum(k_floor, 0.0)
+        logpmf = k_safe * sf.log(sf.maximum(mu_f, 1e-300)) - mu_f - sf.gammaln(k_safe + 1.0)
+        return sf.where(valid, sf.exp(logpmf), 0.0)
+    def cdf(self, k, mu, *, loc=0):
+        sf = self._sf
+        k_f = sf.as_float64(k) - float(loc)
+        mu_f = float(mu)
+        if mu_f < 0.0:
+            return sf.where(k_f * 0 + 1, float("nan"), float("nan"))
+        k_floor = sf.floor(k_f)
+        return sf.where(k_floor < 0.0, 0.0, sf.gammaincc(k_floor + 1.0, mu_f))
+    def sf(self, k, mu, *, loc=0):
+        return sf_safe_sub(1.0, self.cdf(k, mu, loc=loc), self._sf)
+    def ppf(self, q, mu, *, loc=0):
+        sf = self._sf
+        loc_f = float(loc)
+        q_f = sf.as_float64(q)
+        return self._ppf_search(q_f, mu) + loc_f
+    def isf(self, q, mu, *, loc=0):
+        return self.ppf(1.0 - self._sf.as_float64(q), mu, loc=loc)
+    def rvs(self, mu, *, size=None, loc=0, dtype=None):
+        return _rvs_poisson(self._sf, mu=mu, size=size, loc=loc)
+class BinomDistributionBase:
+    def __init__(self, sf: SpecialFunctions):
+        self._sf = sf
+    def _ppf_search(self, q, n, p):
+        sf = self._sf
+        q_f = sf.as_float64(q)
+        n_i = int(n)
+        p_f = float(p)
+        out = sf.where(q_f * 0 + 1, float("nan"), float("nan"))
+        if n_i < 0 or p_f < 0.0 or p_f > 1.0:
+            return out
+        out = sf.where(q_f == 0.0, -1.0, out)
+        out = sf.where(q_f == 1.0, float(n_i), out)
+        valid = (q_f > 0.0) & (q_f < 1.0)
+        if not bool(sf.any(valid)):
+            return out
+        low = sf.where(q_f * 0 + 1, -1.0, -1.0)
+        high = sf.where(q_f * 0 + 1, float(n_i), float(n_i))
+        steps = int(np.ceil(np.log2(max(n_i + 2, 2)))) + 2
+        for _ in range(max(1, steps)):
+            mid = sf.floor((low + high) / 2.0)
+            cdf_mid = self.cdf(mid, n_i, p_f, loc=0)
+            move_right = valid & (cdf_mid < q_f)
+            low = sf.where(move_right, mid, low)
+            high = sf.where(valid & (~move_right), mid, high)
+        k = sf.floor(high)
+        cdf_k = self.cdf(k, n_i, p_f, loc=0)
+        k = sf.where(valid & (cdf_k < q_f), k + 1.0, k)
+        km1 = k - 1.0
+        cdf_km1 = self.cdf(km1, n_i, p_f, loc=0)
+        return sf.where(valid & (km1 >= -1.0) & (cdf_km1 >= q_f), km1, sf.where(valid, k, out))
+    def pmf(self, k, n, p, *, loc=0):
+        sf = self._sf
+        n_i = int(n)
+        p_f = float(p)
+        k_f = sf.as_float64(k) - float(loc)
+        if n_i < 0 or p_f < 0.0 or p_f > 1.0:
+            return sf.where(k_f * 0 + 1, float("nan"), float("nan"))
+        k_floor = sf.floor(k_f)
+        is_int = (k_floor == k_f)
+        valid = (k_floor >= 0.0) & (k_floor <= float(n_i)) & is_int
+        k_safe = sf.clip(k_floor, 0.0, float(n_i))
+        logcoef = sf.gammaln(n_i + 1.0) - sf.gammaln(k_safe + 1.0) - sf.gammaln(n_i - k_safe + 1.0)
+        logpmf = logcoef + k_safe * sf.log(sf.maximum(p_f, 1e-300)) + (n_i - k_safe) * sf.log(sf.maximum(1.0 - p_f, 1e-300))
+        return sf.where(valid, sf.exp(logpmf), 0.0)
+    def cdf(self, k, n, p, *, loc=0):
+        sf = self._sf
+        n_i = int(n)
+        p_f = float(p)
+        k_f = sf.as_float64(k) - float(loc)
+        if n_i < 0 or p_f < 0.0 or p_f > 1.0:
+            return sf.where(k_f * 0 + 1, float("nan"), float("nan"))
+        k_floor = sf.floor(k_f)
+        out = sf.where(k_floor < 0.0, 0.0, sf.betainc(n_i - k_floor, k_floor + 1.0, 1.0 - p_f))
+        return sf.where(k_floor >= float(n_i), 1.0, out)
+    def sf(self, k, n, p, *, loc=0):
+        return sf_safe_sub(1.0, self.cdf(k, n, p, loc=loc), self._sf)
+    def ppf(self, q, n, p, *, loc=0):
+        sf = self._sf
+        loc_f = float(loc)
+        q_f = sf.as_float64(q)
+        return self._ppf_search(q_f, n, p) + loc_f
+    def isf(self, q, n, p, *, loc=0):
+        return self.ppf(1.0 - self._sf.as_float64(q), n, p, loc=loc)
+    def rvs(self, n, p, *, size=None, loc=0, dtype=None):
+        return _rvs_binom(self._sf, n=n, p=p, size=size, loc=loc)
+# =============================================================================
+# Approximation-based inverse special functions
+# =============================================================================
+# These provide fast, vectorized (numpy/cupy/torch) initial guesses for
+# gammaincinv and betaincinv.  Used by all three backends.
+# Reference:
+#   - Wilson-Hilferty (1931) for chi2/gamma
+#   - logit-normal approximation for beta (DiDonato & Morris 1996)
+#   - Newton refinement for 1-2 extra correct digits per step
+def _gammaincinv_wilson_hilferty(a, q):
+    """Wilson-Hilferty cube-root approximation for gammaincinv(a, q).
+    Returns an initial guess x ≈ gammaincinv(a, q).
+    Works for a > 0, q ∈ (0, 1).  Best for a >= 1.
+    """
+    import scipy.special as _scsp
+    a_f = float(a)
+    c = 1.0 / (9.0 * a_f)
+    s = math.sqrt(c)
+    z = -math.sqrt(2.0) * _scsp.erfcinv(2.0 * np.asarray(q, dtype=np.float64))
+    x = a_f * (1.0 - c + z * s) ** 3
+    return np.where(x > 0, x, 1e-10)
+def _gammaincinv_a_small(a, q):
+    """Approximation for gammaincinv(a, q) when a < 1.
+    Uses power series for small q and normal approximation for large q.
+    """
+    import scipy.special as _scsp
+    q_arr = np.asarray(q, dtype=np.float64)
+    a_f = float(a)
+    g_a1 = math.exp(_scsp.gammaln(a_f + 1.0))
+    # Series: P(a,x) ≈ x^a / Gamma(a+1) for small x → x ≈ (q * Gamma(a+1))^(1/a)
+    x_small = (q_arr * g_a1) ** (1.0 / a_f)
+    # For large q, use Wilson-Hilferty even though it's designed for a >= 1
+    x_large = _gammaincinv_wilson_hilferty(a_f, q_arr)
+    # Blend: use small approx when x_small < 1, large approx otherwise
+    return np.where(x_small < 1.0, x_small, x_large)
+def _gammaincinv_newton_numpy(a, q, x0, n_iter=3):
+    """Refine gammaincinv(a, q) using Newton's method.
+    x0: initial guess (numpy array)
+    n_iter: number of Newton refinement steps (default 3, each gives ~1 extra digit)
+    """
+    import scipy.special as scsp
+    x = np.asarray(x0, dtype=np.float64)
+    a_f = float(a)
+    log_ga = math.lgamma(a_f)
+    q_arr = np.asarray(q, dtype=np.float64)
+    for _ in range(n_iter):
+        p = scsp.gammainc(a_f, x)
+        diff = p - q_arr
+        if np.max(np.abs(diff)) < 1e-14:
+            break
+        log_deriv = (a_f - 1.0) * np.log(np.clip(x, 1e-300, None)) - x - log_ga
+        deriv = np.exp(log_deriv)
+        deriv = np.clip(deriv, 1e-300, 1e300)
+        x = x - diff / deriv
+        x = np.clip(x, 1e-15, 1e6)
+    return x
+def _betaincinv_logit_approx(a, b, q):
+    """Logit-normal approximation for betaincinv(a, b, q).
+    For Beta(a, b), the logit transform log(X/(1-X)) is approximately normal
+    with mean ψ(a) - ψ(b) and variance 1/a + 1/b (Digamma approximation).
+    """
+    import scipy.special as _scsp
+    a_f, b_f = float(a), float(b)
+    mu = _scsp.digamma(a_f) - _scsp.digamma(b_f)
+    sigma2 = 1.0 / a_f + 1.0 / b_f
+    sigma = math.sqrt(sigma2)
+    z = -math.sqrt(2.0) * _scsp.erfcinv(2.0 * np.asarray(q, dtype=np.float64))
+    logit_q = mu + sigma * z
+    x = 1.0 / (1.0 + np.exp(-logit_q))
+    return np.clip(x, 1e-15, 1.0 - 1e-15)
+def _betaincinv_newton_numpy(a, b, q, x0, n_iter=3):
+    """Refine betaincinv(a, b, q) using Newton's method.
+    x0: initial guess (numpy array)
+    n_iter: number of Newton refinement steps
+    """
+    import scipy.special as scsp
+    x = np.asarray(x0, dtype=np.float64)
+    a_f, b_f = float(a), float(b)
+    log_beta = math.lgamma(a_f) + math.lgamma(b_f) - math.lgamma(a_f + b_f)
+    q_arr = np.asarray(q, dtype=np.float64)
+    for _ in range(n_iter):
+        p = scsp.betainc(a_f, b_f, x)
+        diff = p - q_arr
+        if np.max(np.abs(diff)) < 1e-14:
+            break
+        log_deriv = (a_f - 1.0) * np.log(np.clip(x, 1e-300, None)) + \
+                    (b_f - 1.0) * np.log(np.clip(1.0 - x, 1e-300, None)) - log_beta
+        deriv = np.exp(log_deriv)
+        deriv = np.clip(deriv, 1e-300, 1e300)
+        x = x - diff / deriv
+        x = np.clip(x, 1e-15, 1.0 - 1e-15)
+    return x
+def _t_ppf_cornish_fisher(df, q):
+    """Cornish-Fisher expansion for Student-t quantile function.
+    Avoids the expensive betaincinv call.
+    Accuracy: ~1e-10 for df >= 2, ~1e-6 for df < 2.
+    """
+    import scipy.special as _scsp
+    z = -math.sqrt(2.0) * _scsp.erfcinv(2.0 * np.asarray(q, dtype=np.float64))
+    z2 = z * z
+    z3 = z2 * z
+    z5 = z3 * z2
+    df_f = float(df)
+    # Hall (1992) approximation for t quantile
+    d1 = 1.0 / (4.0 * df_f)
+    d2 = 1.0 / (96.0 * df_f * df_f)
+    d3 = 1.0 / (384.0 * df_f * df_f * df_f)
+    d4 = 1.0 / (9216.0 * df_f * df_f * df_f)
+    t_approx = z + (z3 + z) * d1 + (5.0 * z5 + 16.0 * z3 + 3.0 * z) * d2 + \
+               (3.0 * z5 + 19.0 * z3 + 17.0 * z) * d3 + \
+               (79.0 * z5 + 462.0 * z3 + 579.0 * z) * d4
+    return np.asarray(t_approx)
+def _t_ppf_hall_approx(df, q):
+    """Hall's (1992) approximation for t quantile.
+    More accurate than basic Cornish-Fisher, error ~1e-14 for df >= 1.
+    Uses the inverse of the regularized incomplete beta via a
+    transformed normal approximation.
+    """
+    import scipy.special as scsp
+    df_f = float(df)
+    # Fisher-Cornish expansion
+    z = -math.sqrt(2.0) * scsp.erfcinv(2.0 * q)
+    z2 = z * z
+    z3 = z2 * z
+    z4 = z3 * z
+    z5 = z4 * z
+    # Coefficients from Hall (1992) Biometrika
+    a1 = 1.0 / 4.0
+    a2 = 1.0 / 96.0
+    a3 = -1.0 / 96.0
+    a4 = -1.0 / 384.0
+    nu = df_f
+    t = z + (z3 + z) * a1 / nu + \
+        (5.0 * z5 + 16.0 * z3 + 3.0 * z) * a2 / (nu * nu) + \
+        (3.0 * z5 + 19.0 * z3 + 17.0 * z) * a3 / (nu * nu * nu) + \
+        (79.0 * z5 + 462.0 * z3 + 579.0 * z) * a4 / (nu * nu * nu * nu)
+    return np.asarray(t)
+def _t_ppf_wilson_hilferty_approx(df, q):
+    """Wilson-Hilferty-type approximation for t PPF.
+    Uses the relationship t^2 ~ df * F(1, df) and approximates the
+    F quantile via chi2 approximation.
+    Best for |z| < 5 and df > 1.
+    """
+    import scipy.special as scsp
+    df_f = float(df)
+    q2 = q  # keep signed
+    # For signed quantiles, work with |z| and restore sign
+    sign = np.sign(q2 - 0.5)
+    sign = np.where(sign == 0, 1.0, sign)
+    q_abs = np.abs(q2 - 0.5) + 0.5  # always in (0.5, 1]
+    # z = Φ^{-1}(q)
+    z = -math.sqrt(2.0) * scsp.erfcinv(2.0 * q_abs)
+    z = z * sign
+    # Refinement: t ≈ z * (1 - 1/(4*df) + z^2/(96*df^2))^{-1/2} ...
+    # This is a simplified version of the Hall approximation
+    z2 = z * z
+    t = z * (1.0 + (z2 - 1.0) / (4.0 * df_f) + (5.0 * z2 * (z2 + 7.0) - 2.0) / (96.0 * df_f * df_f))
+    return np.asarray(t)
+# =============================================================================
+# Scalar-function helpers (atan, log, log1p, square, abs, power, floor)
+# =============================================================================
+# These need per-backend implementations.  We store them on the sf objects
+# but provide fallbacks for protocols that don't define them.
+def _scalar_op(sf, name, *args):
+    """Call a scalar operation, falling back to numpy if not on sf."""
+    fn = getattr(sf, name, None)
+    if fn is not None:
+        return fn(*args)
+    np_fn = getattr(np, name)
+    return np_fn(*[np.asarray(a) for a in args])
+class _SpecialFunctionsMixin:
+    """Mixin adding scalar ops to the three concrete SpecialFunctions impls."""
+    def sqrt(self, x):
+        return _scalar_op(type(self), "sqrt", x)
+    def log(self, x):
+        return _scalar_op(type(self), "log", x)
+    def log1p(self, x):
+        return _scalar_op(type(self), "log1p", x)
+    def square(self, x):
+        return _scalar_op(type(self), "square", x)
+    def abs(self, x):
+        return _scalar_op(type(self), "abs", x)
+    def power(self, x, y):
+        return _scalar_op(type(self), "power", x, y)
+    def floor(self, x):
+        return _scalar_op(type(self), "floor", x)
+    def atan(self, x):
+        return _scalar_op(type(self), "arctan", x)
+    def exp(self, x):
+        return _scalar_op(type(self), "exp", x)
+    def maximum(self, x, y):
+        return _scalar_op(type(self), "maximum", x, y)
+    def minimum(self, x, y):
+        return _scalar_op(type(self), "minimum", x, y)
+    def any(self, x):
+        return _scalar_op(type(self), "any", x)
+    def to_numpy(self, x):
+        return np.asarray(x)
+# Patch scalar ops into each concrete implementation
+for _cls in (CuPySpecialFunctions, TorchSpecialFunctions, ScipySpecialFunctions):
+    for _name in ("sqrt", "log", "log1p", "square", "abs", "power", "floor", "atan", "tan", "exp", "maximum", "minimum", "any", "to_numpy"):
+        _np_name = {"atan": "arctan", "tan": "tan"}.get(_name, _name)
+        if _name in ("power", "maximum", "minimum"):
+            def _make_bin(_n=_np_name):
+                return lambda self, x, y: getattr(np, _n)(np.asarray(x), np.asarray(y))
+            setattr(_cls, _name, _make_bin())
+        elif _name == "any":
+            def _make_any():
+                return lambda self, x: np.any(np.asarray(x))
+            setattr(_cls, _name, _make_any())
+        else:
+            def _make_fn(_n=_np_name):
+                return lambda self, x: getattr(np, _n)(np.asarray(x))
+            setattr(_cls, _name, _make_fn())
+# Now override with backend-native versions
+# Fix to_numpy for ScipySpecialFunctions (np.to_numpy doesn't exist)
+ScipySpecialFunctions.to_numpy = lambda self, x: np.asarray(x)
+CuPySpecialFunctions.sqrt = lambda self, x: self._cp.sqrt(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.log = lambda self, x: self._cp.log(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.log1p = lambda self, x: self._cp.log1p(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.square = lambda self, x: self._cp.square(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.abs = lambda self, x: self._cp.abs(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.power = lambda self, x, y: self._cp.power(self._cp.asarray(x, dtype=self._cp.float64), self._cp.asarray(y, dtype=self._cp.float64))
+CuPySpecialFunctions.floor = lambda self, x: self._cp.floor(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.atan = lambda self, x: self._cp.arctan(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.tan = lambda self, x: self._cp.tan(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.exp = lambda self, x: self._cp.exp(self._cp.asarray(x, dtype=self._cp.float64))
+CuPySpecialFunctions.maximum = lambda self, x, y: self._cp.maximum(self._cp.asarray(x, dtype=self._cp.float64), self._cp.asarray(y, dtype=self._cp.float64))
+CuPySpecialFunctions.minimum = lambda self, x, y: self._cp.minimum(self._cp.asarray(x, dtype=self._cp.float64), self._cp.asarray(y, dtype=self._cp.float64))
+CuPySpecialFunctions.any = lambda self, x: self._cp.any(x)
+CuPySpecialFunctions.to_numpy = lambda self, x: self._cp.asnumpy(x) if hasattr(x, 'get') else np.asarray(x)
+TorchSpecialFunctions.sqrt = lambda self, x: self._torch.sqrt(self._as_tensor(x))
+TorchSpecialFunctions.log = lambda self, x: self._torch.log(self._as_tensor(x))
+TorchSpecialFunctions.log1p = lambda self, x: self._torch.log1p(self._as_tensor(x))
+TorchSpecialFunctions.square = lambda self, x: self._torch.square(self._as_tensor(x))
+TorchSpecialFunctions.abs = lambda self, x: self._torch.abs(self._as_tensor(x))
+TorchSpecialFunctions.power = lambda self, x, y: self._torch.pow(self._as_tensor(x), self._as_tensor(y))
+TorchSpecialFunctions.floor = lambda self, x: self._torch.floor(self._as_tensor(x))
+TorchSpecialFunctions.atan = lambda self, x: self._torch.atan(self._as_tensor(x))
+TorchSpecialFunctions.tan = lambda self, x: self._torch.tan(self._as_tensor(x))
+TorchSpecialFunctions.exp = lambda self, x: self._torch.exp(self._as_tensor(x))
+TorchSpecialFunctions.maximum = lambda self, x, y: self._torch.maximum(self._as_tensor(x), self._as_tensor(y))
+TorchSpecialFunctions.minimum = lambda self, x, y: self._torch.minimum(self._as_tensor(x), self._as_tensor(y))
+TorchSpecialFunctions.any = lambda self, x: self._torch.any(x)
+TorchSpecialFunctions.to_numpy = lambda self, x: x.detach().cpu().numpy() if hasattr(x, 'cpu') else np.asarray(x)
+# =============================================================================
+# Safe scalar operations (clip 1-x to [0,1] for SF computation)
+# =============================================================================
+def sf_safe_sub(val, other, sf):
+    """Compute val - other, clamping result to [0, 1]."""
+    return sf.clip(val - other, 0.0, 1.0)
+def sf_safe_mul(val, factor, sf):
+    """Compute val * factor, clamping result to [0, 1]."""
+    return sf.clip(val * factor, 0.0, 1.0)
+# =============================================================================
+# Random variate helpers (pure-numpy CPU generation, then converted)
+# =============================================================================
+def _rvs_normal(sf, *, size, loc, scale):
+    out = np.random.normal(loc=float(loc), scale=float(scale), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_t(sf, *, df, size, loc, scale):
+    # Use scipy fallback for t-distribution rvs
+    import scipy.stats as sps
+    out = sps.t.rvs(df=float(df), size=size, loc=float(loc), scale=float(scale))
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_uniform(sf, *, size, loc, scale):
+    out = np.random.uniform(low=float(loc), high=float(loc) + float(scale), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_expon(sf, *, size, loc, scale):
+    out = float(loc) + np.random.exponential(scale=float(scale), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_cauchy(sf, *, size, loc, scale):
+    u = np.random.random(size=size)
+    out = float(loc) + float(scale) * np.tan(np.pi * (u - 0.5))
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_laplace(sf, *, size, loc, scale):
+    out = np.random.laplace(loc=float(loc), scale=float(scale), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_logistic(sf, *, size, loc, scale):
+    u = np.random.random(size=size)
+    out = float(loc) + float(scale) * np.log(u / (1.0 - u))
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_chi2(sf, *, df, size):
+    out = np.random.chisquare(df=float(df), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_gamma(sf, *, a, size, loc, scale):
+    out = float(loc) + np.random.gamma(shape=float(a), scale=float(scale), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_beta(sf, *, a, b, size, loc, scale):
+    out = float(loc) + float(scale) * np.random.beta(float(a), float(b), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_f(sf, *, dfn, dfd, size):
+    out = np.random.f(dfn=float(dfn), dfd=float(dfd), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_weibull(sf, *, c, size, loc, scale):
+    out = float(loc) + float(scale) * np.random.weibull(a=float(c), size=size)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_lognorm(sf, *, s, size, loc, scale):
+    out = float(loc) + float(scale) * np.exp(float(s) * np.random.normal(size=size))
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_poisson(sf, *, mu, size, loc):
+    out = np.random.poisson(lam=float(mu), size=size) + int(loc)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+def _rvs_binom(sf, *, n, p, size, loc):
+    out = np.random.binomial(n=int(n), p=float(p), size=size) + int(loc)
+    if hasattr(sf, 'as_float64'):
+        return sf.as_float64(out)
+    return out
+# =============================================================================
+# Factory
+# =============================================================================
+_DISTRIBUTION_FACTORIES = {
+    "norm": lambda sf: NormDistributionBase(sf),
+    "t": lambda sf: TDistributionBase(sf),
+    "uniform": lambda sf: UniformDistributionBase(sf),
+    "expon": lambda sf: ExponDistributionBase(sf),
+    "cauchy": lambda sf: CauchyDistributionBase(sf),
+    "laplace": lambda sf: LaplaceDistributionBase(sf),
+    "logistic": lambda sf: LogisticDistributionBase(sf),
+    "chi2": lambda sf: Chi2DistributionBase(sf),
+    "gamma": lambda sf: GammaDistributionBase(sf),
+    "beta": lambda sf: BetaDistributionBase(sf),
+    "f": lambda sf: FDistributionBase(sf),
+    "weibull_min": lambda sf: WeibullMinDistributionBase(sf),
+    "lognorm": lambda sf: LognormDistributionBase(sf, NormDistributionBase(sf)),
+    "poisson": lambda sf: PoissonDistributionBase(sf),
+    "binom": lambda sf: BinomDistributionBase(sf),
+}
+_NATIVE_NAMES = sorted(_DISTRIBUTION_FACTORIES.keys())
+def _make_sf(backend: str, device: str | None = None, *, use_lut: bool = True) -> SpecialFunctions:
+    """Create a SpecialFunctions instance for the given backend name."""
+    if backend == "numpy":
+        return ScipySpecialFunctions(use_lut=use_lut)
+    if backend == "cupy":
+        return CuPySpecialFunctions(use_lut=use_lut)
+    if backend == "torch":
+        return TorchSpecialFunctions(device=device, use_lut=use_lut)
+    raise ValueError(f"Unsupported backend: {backend}")
+def get_distribution(name: str, backend: str = "auto", device: str | None = None, *, use_lut: bool = True):
+    """Get a distribution object for the given backend.
+    Parameters
+    ----------
+    name : str
+        Distribution name (e.g. ``'norm'``, ``'t'``, ``'chi2'``).
+    backend : {'auto', 'numpy', 'cupy', 'torch'}, default='auto'
+        Which backend to use.  ``'auto'`` picks the first available GPU
+        backend (cupy > torch) or falls back to numpy.
+    device : str, optional
+        Torch device string (e.g. ``'cuda'``, ``'cuda:0'``, ``'cpu'``).
+        Only used when backend is ``'torch'``.
+    use_lut : bool, default=True
+        Use LUT cache + 1-step Newton refinement for inverse special functions
+        (``betaincinv``, ``gammaincinv``).  When ``False``, falls back to the
+        full iterative solver (scipy for numpy, Newton-Raphson for torch).
+        ``True`` gives 10-500x speedup for ``t.ppf``/``f.ppf`` on GPU,
+        with negligible accuracy loss (LUT is built from scipy reference values).
+    Returns
+    -------
+    Distribution object with methods: cdf, sf, ppf, isf, pdf, rvs, etc.
+    """
+    if backend == "auto":
+        if CuPySpecialFunctions is not None:  # always importable if cupy installed
+            try:
+                return get_distribution(name, backend="cupy", device=device, use_lut=use_lut)
+            except Exception:
+                pass
+        try:
+            return get_distribution(name, backend="torch", device=device, use_lut=use_lut)
+        except Exception:
+            pass
+        backend = "numpy"
+    sf = _make_sf(backend, device, use_lut=use_lut)
+    factory = _DISTRIBUTION_FACTORIES.get(name)
+    if factory is None:
+        # Try case-insensitive
+        factory = _DISTRIBUTION_FACTORIES.get(name.lower())
+    if factory is None:
+        raise ValueError(f"Unknown distribution: {name}")
+    return factory(sf)
+def list_available_distributions():
+    """List all natively implemented distribution names."""
+    return list(_NATIVE_NAMES)
+# =============================================================================
+# DistributionProxy — module-level lazy singletons
+# =============================================================================
+class DistributionProxy:
+    """Lazy proxy that resolves the backend on each call.
+    Supports ``backend=`` keyword override::
+        norm.cdf(x)                        # auto backend
+        norm.cdf(x, backend="torch")       # force torch
+    """
+    def __init__(self, name: str, default_backend: str = "auto", device: str | None = None, *, use_lut: bool = True):
+        self._name = name
+        self._default_backend = default_backend
+        self._device = device
+        self._use_lut = use_lut
+    def _resolve(self, kwargs, *arrays):
+        from statgpu.backends import _is_torch_array, _resolve_backend
+        backend = kwargs.pop("backend", self._default_backend)
+        device = kwargs.pop("device", self._device)
+        use_lut = kwargs.pop("use_lut", self._use_lut)
+        if backend == "auto":
+            backend = _resolve_backend("auto", *arrays, *kwargs.values())
+        if backend == "torch" and device is None:
+            for arr in (*arrays, *kwargs.values()):
+                if _is_torch_array(arr):
+                    device = str(arr.device)
+                    break
+        return get_distribution(self._name, backend=backend, device=device, use_lut=use_lut)
+    def __repr__(self):
+        return (f"DistributionProxy({self._name!r}, "
+                f"backend={self._default_backend!r}, "
+                f"use_lut={self._use_lut!r})")
+    def cdf(self, x, **kw):
+        return self._resolve(kw, x).cdf(x, **kw)
+    def sf(self, x, **kw):
+        return self._resolve(kw, x).sf(x, **kw)
+    def ppf(self, q, **kw):
+        return self._resolve(kw, q).ppf(q, **kw)
+    def isf(self, q, **kw):
+        return self._resolve(kw, q).isf(q, **kw)
+    def pdf(self, x, **kw):
+        return self._resolve(kw, x).pdf(x, **kw)
+    def pmf(self, k, **kw):
+        return self._resolve(kw, k).pmf(k, **kw)
+    def rvs(self, **kw):
+        return self._resolve(kw, *kw.values()).rvs(**kw)
+    def two_sided_pvalue(self, stat_abs, **kw):
+        return self._resolve(kw, stat_abs).two_sided_pvalue(stat_abs, **kw)
+    def two_sided_critical_value(self, alpha, **kw):
+        return self._resolve(kw, alpha).two_sided_critical_value(alpha, **kw)
+# Module-level singletons (lazy, backend resolved per-call)
+norm = DistributionProxy("norm")
+t = DistributionProxy("t")
+uniform = DistributionProxy("uniform")
+expon = DistributionProxy("expon")
+cauchy = DistributionProxy("cauchy")
+laplace = DistributionProxy("laplace")
+logistic = DistributionProxy("logistic")
+chi2 = DistributionProxy("chi2")
+gamma = DistributionProxy("gamma")
+beta = DistributionProxy("beta")
+f = DistributionProxy("f")
+weibull_min = DistributionProxy("weibull_min")
+lognorm = DistributionProxy("lognorm")
+poisson = DistributionProxy("poisson")
+binom = DistributionProxy("binom")
+# Backward-compatible aliases (old CuPy-specific class names)
+NormDistributionGPU = NormDistributionBase
+TDistributionGPU = TDistributionBase
+UniformDistributionGPU = UniformDistributionBase
+ExponDistributionGPU = ExponDistributionBase
+CauchyDistributionGPU = CauchyDistributionBase
+LaplaceDistributionGPU = LaplaceDistributionBase
+LogisticDistributionGPU = LogisticDistributionBase
+Chi2DistributionGPU = Chi2DistributionBase
+GammaDistributionGPU = GammaDistributionBase
+BetaDistributionGPU = BetaDistributionBase
+FDistributionGPU = FDistributionBase
+WeibullMinDistributionGPU = WeibullMinDistributionBase
+LognormDistributionGPU = LognormDistributionBase
+PoissonDistributionGPU = PoissonDistributionBase
+BinomDistributionGPU = BinomDistributionBase
+def get_distribution_gpu(name: str, *, allow_fallback: bool = False):
+    """Backward-compatible wrapper: get GPU distribution by name.
+    Delegates to the unified factory, defaulting to the best GPU backend.
+    """
+    import scipy.stats as sps
+    key = str(name).strip()
+    if key.lower() in _DISTRIBUTION_FACTORIES:
+        return get_distribution(key.lower(), backend="auto")
+    if allow_fallback:
+        if hasattr(sps, key.lower()) or hasattr(sps, key):
+            return ScipyFallbackDistribution(key.lower() if hasattr(sps, key.lower()) else key)
+    if hasattr(sps, key.lower()) or hasattr(sps, key):
+        raise ValueError(
+            f"Distribution '{name}' has no native GPU implementation. "
+            "Set allow_fallback=True for explicit SciPy fallback."
+        )
+    raise ValueError(f"Unknown scipy.stats distribution: {name}")
+def list_available_distributions_gpu(include_scipy: bool = True):
+    """Backward-compatible: list available distribution names."""
+    native = list_available_distributions()
+    if not include_scipy:
+        return native
+    import scipy.stats as sps
+    from scipy.stats import rv_continuous, rv_discrete
+    scipy_names = []
+    for n in dir(sps):
+        if n.startswith("_"):
+            continue
+        try:
+            obj = getattr(sps, n)
+        except Exception:
+            continue
+        if isinstance(obj, (rv_continuous, rv_discrete)):
+            scipy_names.append(n)
+    return sorted(set(native + scipy_names))
+class ScipyFallbackDistribution:
+    """Dynamic scipy.stats distribution wrapper returning GPU-backed outputs."""
+    def __init__(self, name: str):
+        self.name = str(name)
+    def __repr__(self):
+        return f"ScipyFallbackDistribution('{self.name}')"
+    def _call(self, method_name, *args, **kwargs):
+        import scipy.stats as sps
+        dist = getattr(sps, self.name)
+        method = getattr(dist, method_name)
+        # Convert any GPU arrays to numpy for scipy
+        np_args = []
+        for v in args:
+            if hasattr(v, "get"):
+                np_args.append(v.get())
+            elif hasattr(v, "detach"):
+                np_args.append(v.detach().cpu().numpy())
+            else:
+                np_args.append(v)
+        np_kw = {}
+        for k, v in kwargs.items():
+            if hasattr(v, "get"):
+                np_kw[k] = v.get()
+            elif hasattr(v, "detach"):
+                np_kw[k] = v.detach().cpu().numpy()
+            else:
+                np_kw[k] = v
+        result = method(*np_args, **np_kw)
+        # Try to convert result back to GPU if default backend is GPU
+        try:
+            from statgpu.backends import get_backend
+            backend = get_backend()
+            if backend.name != "numpy":
+                return backend.asarray(result)
+        except Exception:
+            pass
+        return result
+    def cdf(self, x, *shape_args, **kwargs):
+        return self._call("cdf", x, *shape_args, **kwargs)
+    def sf(self, x, *shape_args, **kwargs):
+        return self._call("sf", x, *shape_args, **kwargs)
+    def ppf(self, q, *shape_args, **kwargs):
+        return self._call("ppf", q, *shape_args, **kwargs)
+    def isf(self, q, *shape_args, **kwargs):
+        return self._call("isf", q, *shape_args, **kwargs)
+    def pdf(self, x, *shape_args, **kwargs):
+        return self._call("pdf", x, *shape_args, **kwargs)
+    def pmf(self, x, *shape_args, **kwargs):
+        return self._call("pmf", x, *shape_args, **kwargs)
+    def rvs(self, *shape_args, size=None, dtype=None, **kwargs):
+        out = self._call("rvs", *shape_args, size=size, **kwargs)
+        if dtype is not None and hasattr(out, "astype"):
+            out = out.astype(dtype)
+        return out
+# =============================================================================
+# Backward-compatible special function aliases (for old consumers)
+# =============================================================================
+def regularized_betainc_gpu(a, b, x):
+    """Backward-compatible alias: use get_distribution for new code."""
+    sf = CuPySpecialFunctions()
+    return sf.betainc(a, b, x)
+def regularized_betaincinv_gpu(a, b, y):
+    """Backward-compatible alias."""
+    sf = CuPySpecialFunctions()
+    return sf.betaincinv(a, b, y)
+def gammainc_gpu(a, x):
+    """Backward-compatible alias."""
+    sf = CuPySpecialFunctions()
+    return sf.gammainc(a, x)
+def gammaincc_gpu(a, x):
+    """Backward-compatible alias."""
+    sf = CuPySpecialFunctions()
+    return sf.gammaincc(a, x)
+def gammaincinv_gpu(a, q):
+    """Backward-compatible alias."""
+    sf = CuPySpecialFunctions()
+    return sf.gammaincinv(a, q)
+def gammaln_gpu(x):
+    """Backward-compatible alias."""
+    sf = CuPySpecialFunctions()
+    return sf.gammaln(x)
+# =============================================================================
+# Legacy distribution-function names (R-style)
+# =============================================================================
+_LEGACY_DISTRIBUTION_FUNCTION_NAMES = {
+    "t_cdf_gpu", "t_sf_gpu", "t_ppf_gpu", "t_two_sided_pvalue_gpu",
+    "t_two_sided_critical_value_gpu", "norm_cdf_gpu", "norm_sf_gpu",
+    "norm_ppf_gpu", "norm_isf_gpu", "norm_two_sided_pvalue_gpu",
+    "norm_two_sided_critical_value_gpu", "rnorm_gpu", "dnorm_gpu",
+    "dt_gpu", "rt_gpu", "pnorm_gpu", "qnorm_gpu", "pt_gpu", "qt_gpu",
+    "dchisq_gpu", "pchisq_gpu", "qchisq_gpu", "rchisq_gpu",
+    "dgamma_gpu", "pgamma_gpu", "qgamma_gpu", "rgamma_gpu",
+    "dbeta_gpu", "pbeta_gpu", "qbeta_gpu", "rbeta_gpu",
+    "df_gpu", "pf_gpu", "qf_gpu", "rf_gpu",
+    "dpois_gpu", "ppois_gpu", "qpois_gpu", "rpois_gpu",
+    "dbinom_gpu", "pbinom_gpu", "qbinom_gpu", "rbinom_gpu",
+}
+def __getattr__(name):
+    """Lazy access to legacy distribution functions."""
+    if name.startswith("_"):
+        raise AttributeError(f"module {__name__} has no attribute {name}")
+    if name in _LEGACY_DISTRIBUTION_FUNCTION_NAMES:
+        from statgpu.linear_model.legacy import _distributions_legacy_gpu as legacy
+        return getattr(legacy, name)
+    try:
+        return get_distribution_gpu(name)
+    except Exception as exc:
+        raise AttributeError(f"module {__name__} has no attribute {name}") from exc
+# =============================================================================
+# Exports
+# =============================================================================
+__all__ = [
+    # Core
+    "get_distribution",
+    "list_available_distributions",
+    "DistributionProxy",
+    "SpecialFunctions",
+    # Backends
+    "CuPySpecialFunctions",
+    "TorchSpecialFunctions",
+    "ScipySpecialFunctions",
+    # Distributions
+    "NormDistributionBase",
+    "TDistributionBase",
+    "UniformDistributionBase",
+    "ExponDistributionBase",
+    "CauchyDistributionBase",
+    "LaplaceDistributionBase",
+    "LogisticDistributionBase",
+    "Chi2DistributionBase",
+    "GammaDistributionBase",
+    "BetaDistributionBase",
+    "FDistributionBase",
+    "WeibullMinDistributionBase",
+    "LognormDistributionBase",
+    "PoissonDistributionBase",
+    "BinomDistributionBase",
+    # Module-level proxies
+    "norm", "t", "uniform", "expon", "cauchy", "laplace",
+    "logistic", "chi2", "gamma", "beta", "f",
+    "weibull_min", "lognorm", "poisson", "binom",
+    # Backward compat
+    "NormDistributionGPU", "TDistributionGPU", "UniformDistributionGPU",
+    "ExponDistributionGPU", "CauchyDistributionGPU", "LaplaceDistributionGPU",
+    "LogisticDistributionGPU", "Chi2DistributionGPU", "GammaDistributionGPU",
+    "BetaDistributionGPU", "FDistributionGPU", "WeibullMinDistributionGPU",
+    "LognormDistributionGPU", "PoissonDistributionGPU", "BinomDistributionGPU",
+    "ScipyFallbackDistribution",
+    "get_distribution_gpu",
+    "list_available_distributions_gpu",
+]