PyPI - statgpu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

statgpu/__init__.py +174 -0
statgpu/_base.py +544 -0
statgpu/_config.py +127 -0
statgpu/anova/__init__.py +5 -0
statgpu/anova/_oneway.py +194 -0
statgpu/backends/__init__.py +83 -0
statgpu/backends/_array_ops.py +529 -0
statgpu/backends/_base.py +184 -0
statgpu/backends/_cupy.py +453 -0
statgpu/backends/_factory.py +65 -0
statgpu/backends/_gpu_inference_cupy.py +214 -0
statgpu/backends/_gpu_inference_torch.py +422 -0
statgpu/backends/_numpy.py +324 -0
statgpu/backends/_torch.py +685 -0
statgpu/backends/_torch_safe.py +47 -0
statgpu/backends/_utils.py +423 -0
statgpu/core/__init__.py +10 -0
statgpu/core/formula/__init__.py +33 -0
statgpu/core/formula/_design.py +99 -0
statgpu/core/formula/_parser.py +191 -0
statgpu/core/formula/_terms.py +70 -0
statgpu/core/formula/tests/__init__.py +0 -0
statgpu/core/formula/tests/test_parser.py +194 -0
statgpu/covariance/__init__.py +6 -0
statgpu/covariance/_empirical.py +310 -0
statgpu/covariance/_shrinkage.py +248 -0
statgpu/cross_validation/__init__.py +31 -0
statgpu/cross_validation/_base.py +410 -0
statgpu/cross_validation/_engine.py +167 -0
statgpu/diagnostics/__init__.py +7 -0
statgpu/diagnostics/_regression_diagnostics.py +188 -0
statgpu/feature_selection/__init__.py +24 -0
statgpu/feature_selection/_knockoff.py +870 -0
statgpu/feature_selection/_knockoff_utils.py +1003 -0
statgpu/feature_selection/_stepwise.py +300 -0
statgpu/glm_core/__init__.py +81 -0
statgpu/glm_core/_base.py +202 -0
statgpu/glm_core/_family.py +362 -0
statgpu/glm_core/_fused.py +149 -0
statgpu/glm_core/_gamma.py +111 -0
statgpu/glm_core/_inverse_gaussian.py +62 -0
statgpu/glm_core/_irls.py +561 -0
statgpu/glm_core/_logistic.py +82 -0
statgpu/glm_core/_negative_binomial.py +68 -0
statgpu/glm_core/_poisson.py +60 -0
statgpu/glm_core/_solver_legacy.py +100 -0
statgpu/glm_core/_squared.py +53 -0
statgpu/glm_core/_tweedie.py +74 -0
statgpu/inference/__init__.py +239 -0
statgpu/inference/_distributions_backend.py +2610 -0
statgpu/inference/_multiple_testing.py +391 -0
statgpu/inference/_resampling.py +1400 -0
statgpu/inference/_results.py +265 -0
statgpu/linear_model/__init__.py +75 -0
statgpu/linear_model/_gaussian_inference.py +306 -0
statgpu/linear_model/_glm_base.py +1261 -0
statgpu/linear_model/_ordered_logit.py +52 -0
statgpu/linear_model/_ordered_probit.py +50 -0
statgpu/linear_model/_stats.py +170 -0
statgpu/linear_model/cv/__init__.py +13 -0
statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
statgpu/linear_model/cv/_lasso_cv.py +253 -0
statgpu/linear_model/cv/_logistic_cv.py +895 -0
statgpu/linear_model/cv/_ridge_cv.py +1160 -0
statgpu/linear_model/legacy/__init__.py +1 -0
statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
statgpu/linear_model/legacy/_solver_legacy.py +104 -0
statgpu/linear_model/penalized/__init__.py +25 -0
statgpu/linear_model/penalized/_base.py +437 -0
statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
statgpu/linear_model/penalized/_penalized_linear.py +236 -0
statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
statgpu/linear_model/penalized/_predict_mixin.py +182 -0
statgpu/linear_model/wrappers/__init__.py +31 -0
statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
statgpu/linear_model/wrappers/_elasticnet.py +75 -0
statgpu/linear_model/wrappers/_gamma.py +67 -0
statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
statgpu/linear_model/wrappers/_lasso.py +2124 -0
statgpu/linear_model/wrappers/_linear.py +1127 -0
statgpu/linear_model/wrappers/_logistic.py +1435 -0
statgpu/linear_model/wrappers/_mcp.py +58 -0
statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
statgpu/linear_model/wrappers/_poisson.py +48 -0
statgpu/linear_model/wrappers/_ridge.py +166 -0
statgpu/linear_model/wrappers/_scad.py +58 -0
statgpu/linear_model/wrappers/_tweedie.py +57 -0
statgpu/metrics/__init__.py +21 -0
statgpu/metrics/_classification.py +591 -0
statgpu/nonparametric/__init__.py +50 -0
statgpu/nonparametric/kernel_methods/__init__.py +25 -0
statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
statgpu/nonparametric/kernel_methods/_krr.py +234 -0
statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
statgpu/nonparametric/splines/__init__.py +5 -0
statgpu/nonparametric/splines/_bspline_basis.py +336 -0
statgpu/nonparametric/splines/_penalized.py +349 -0
statgpu/panel/__init__.py +19 -0
statgpu/panel/_covariance.py +140 -0
statgpu/panel/_fixed_effects.py +420 -0
statgpu/panel/_random_effects.py +385 -0
statgpu/panel/_utils.py +482 -0
statgpu/penalties/__init__.py +139 -0
statgpu/penalties/_adaptive_l1.py +313 -0
statgpu/penalties/_base.py +261 -0
statgpu/penalties/_categories.py +39 -0
statgpu/penalties/_elasticnet.py +98 -0
statgpu/penalties/_group_lasso.py +678 -0
statgpu/penalties/_group_mcp.py +553 -0
statgpu/penalties/_group_scad.py +605 -0
statgpu/penalties/_l1.py +107 -0
statgpu/penalties/_l2.py +77 -0
statgpu/penalties/_mcp.py +237 -0
statgpu/penalties/_scad.py +260 -0
statgpu/semiparametric/__init__.py +5 -0
statgpu/semiparametric/_gam.py +401 -0
statgpu/solvers/__init__.py +24 -0
statgpu/solvers/_admm.py +241 -0
statgpu/solvers/_constants.py +15 -0
statgpu/solvers/_convergence.py +6 -0
statgpu/solvers/_fista.py +436 -0
statgpu/solvers/_fista_bb.py +513 -0
statgpu/solvers/_fista_lla.py +541 -0
statgpu/solvers/_lbfgs.py +206 -0
statgpu/solvers/_newton.py +149 -0
statgpu/solvers/_utils.py +277 -0
statgpu/survival/__init__.py +14 -0
statgpu/survival/_cox.py +3974 -0
statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
statgpu/survival/_cox_cv.py +1159 -0
statgpu/survival/_cox_efron_cuda.py +1280 -0
statgpu/survival/_cox_efron_triton.py +359 -0
statgpu/unsupervised/__init__.py +29 -0
statgpu/unsupervised/_agglomerative.py +307 -0
statgpu/unsupervised/_dbscan.py +263 -0
statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
statgpu/unsupervised/_gmm.py +332 -0
statgpu/unsupervised/_incremental_pca.py +176 -0
statgpu/unsupervised/_kmeans.py +261 -0
statgpu/unsupervised/_minibatch_kmeans.py +299 -0
statgpu/unsupervised/_minibatch_nmf.py +252 -0
statgpu/unsupervised/_nmf.py +190 -0
statgpu/unsupervised/_pca.py +189 -0
statgpu/unsupervised/_truncated_svd.py +132 -0
statgpu/unsupervised/_tsne.py +192 -0
statgpu/unsupervised/_umap.py +224 -0
statgpu/unsupervised/_utils.py +134 -0
statgpu-0.1.0.dist-info/METADATA +245 -0
statgpu-0.1.0.dist-info/RECORD +168 -0
statgpu-0.1.0.dist-info/WHEEL +5 -0
statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
statgpu-0.1.0.dist-info/top_level.txt +1 -0

statgpu/penalties/_l1.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""
+L1 penalty (Lasso) implementation.
+P(w) = α * ||w||₁
+"""
+__all__ = ["L1Penalty"]
+from typing import Optional
+from statgpu.backends._array_ops import _xp
+import numpy as np
+from statgpu.penalties._base import Penalty
+# ---- torch.compile lazy-loader (fuses elementwise ops into 1 kernel) ---------
+_L1_PROXIMAL_TORCH_COMPILED = None
+def _get_l1_torch_compiled():
+    global _L1_PROXIMAL_TORCH_COMPILED
+    if _L1_PROXIMAL_TORCH_COMPILED is not None:
+        return _L1_PROXIMAL_TORCH_COMPILED
+    from statgpu.penalties import _torch_compile_ok
+    if not _torch_compile_ok():
+        _L1_PROXIMAL_TORCH_COMPILED = None
+        return None
+    try:
+        import torch
+        def _prox(w, thresh):
+            return torch.sign(w) * torch.relu(torch.abs(w) - thresh)
+        _L1_PROXIMAL_TORCH_COMPILED = torch.compile(_prox, mode='reduce-overhead')
+    except Exception:
+        _L1_PROXIMAL_TORCH_COMPILED = None
+    return _L1_PROXIMAL_TORCH_COMPILED
+class L1Penalty(Penalty):
+    """
+    L1 penalty: P(w) = α * ||w||₁
+    The proximal operator is the soft thresholding function:
+        prox_{λ·||·||₁}(z) = sign(z) * max(|z| - λ, 0)
+    """
+    name = "l1"
+    is_convex = True
+    def __init__(self, alpha: float = 1.0):
+        """
+        Parameters
+        ----------
+        alpha : float, default=1.0
+            Regularization strength.
+        """
+        if alpha < 0:
+            raise ValueError(f"alpha must be non-negative, got {alpha}")
+        self.alpha = alpha
+    def value(self, coef):
+        """P(w) = α * Σ|w_j|"""
+        xp = _xp(coef)
+        return self.alpha * float(xp.sum(xp.abs(coef)))
+    def gradient(self, coef):
+        """∇P(w) = α * sign(w)"""
+        xp = _xp(coef)
+        return self.alpha * xp.sign(coef)
+    def proximal(
+        self,
+        w: np.ndarray,
+        step: float,
+        backend: str = "numpy"
+    ) -> np.ndarray:
+        """
+        Soft thresholding: sign(z) * max(|z| - α*step, 0)
+        Parameters
+        ----------
+        w : array
+            Input array.
+        step : float
+            Step size.
+        backend : str
+            Backend: 'numpy', 'cupy', or 'torch'.
+        Returns
+        -------
+        array
+            Soft-thresholded result.
+        """
+        thresh = self.alpha * step
+        # torch.compile fast path (performance optimization)
+        if backend == "torch":
+            compiled_fn = _get_l1_torch_compiled()
+            if compiled_fn is not None:
+                return compiled_fn(w, thresh)
+        # Unified fallback across numpy/cupy/torch
+        from statgpu.backends._array_ops import _soft_threshold
+        return _soft_threshold(w, thresh)
+    def get_params(self) -> dict:
+        params = super().get_params()
+        params["alpha"] = self.alpha
+        return params

statgpu/penalties/_l2.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""
+L2 penalty (Ridge) implementation.
+P(w) = (α/2) * ||w||²₂
+"""
+__all__ = ["L2Penalty"]
+from typing import Optional
+from statgpu.backends._array_ops import _xp
+import numpy as np
+from statgpu.penalties._base import Penalty
+class L2Penalty(Penalty):
+    """
+    L2 penalty (Ridge): P(w) = (α/2) * ||w||²₂
+    The proximal operator has a closed-form solution:
+        prox_{λ·||·||²/2}(z) = z / (1 + λ*step)
+    """
+    name = "l2"
+    is_convex = True
+    def __init__(self, alpha: float = 1.0):
+        """
+        Parameters
+        ----------
+        alpha : float, default=1.0
+            Regularization strength.
+        """
+        if alpha < 0:
+            raise ValueError(f"alpha must be non-negative, got {alpha}")
+        self.alpha = alpha
+    def value(self, coef):
+        """P(w) = (α/2) * Σw_j²"""
+        xp = _xp(coef)
+        return 0.5 * self.alpha * float(xp.sum(coef ** 2))
+    def gradient(self, coef):
+        """∇P(w) = α * w"""
+        return self.alpha * coef
+    def proximal(
+        self,
+        w: np.ndarray,
+        step: float,
+        backend: str = "numpy"
+    ) -> np.ndarray:
+        """
+        Closed-form for L2: w / (1 + α*step)
+        Parameters
+        ----------
+        w : array
+            Input array.
+        step : float
+            Step size.
+        backend : str
+            Backend: 'numpy', 'cupy', or 'torch'.
+        Returns
+        -------
+        array
+            Scaled result.
+        """
+        scale = 1.0 / (1.0 + self.alpha * step)
+        return scale * w
+    def get_params(self) -> dict:
+        params = super().get_params()
+        params["alpha"] = self.alpha
+        return params

statgpu/penalties/_mcp.py ADDED Viewed

@@ -0,0 +1,237 @@
+"""
+MCP penalty (Minimax Concave Penalty).
+Zhang, Annals of Statistics 2010. Non-convex penalty with oracle property.
+Element-wise:
+    p(w_j) = {
+        alpha * |w_j| - w_j^2 / (2*gamma)     if |w_j| <= gamma*alpha
+        (1/2) * gamma * alpha^2                if |w_j| > gamma*alpha
+    }
+Supports both FISTA direct (proximal) and LLA (lla_weights) optimization.
+"""
+__all__ = ["MCPPenalty"]
+from typing import Optional
+import numpy as np
+from statgpu.penalties._base import Penalty
+from statgpu.backends._array_ops import _xp
+from statgpu.backends._utils import _to_float_scalar
+# ---- torch.compile lazy-loader (fuses elementwise ops into 1 kernel) ---------
+_MCP_PROXIMAL_TORCH_COMPILED = None
+def _get_mcp_torch_compiled():
+    global _MCP_PROXIMAL_TORCH_COMPILED
+    if _MCP_PROXIMAL_TORCH_COMPILED is not None:
+        return _MCP_PROXIMAL_TORCH_COMPILED
+    from statgpu.penalties import _torch_compile_ok
+    if not _torch_compile_ok():
+        _MCP_PROXIMAL_TORCH_COMPILED = None
+        return None
+    try:
+        import torch
+        def _prox(w, step, alpha, gamma):
+            max_step = 0.9 * gamma
+            step = torch.clamp(step, max=max_step)
+            t = alpha * step
+            abs_w = torch.abs(w)
+            sign_w = torch.sign(w)
+            r1 = abs_w <= t
+            r3 = abs_w > gamma * alpha
+            r2 = ~(r1 | r3)
+            result = torch.where(r1,
+                torch.zeros_like(w),
+                torch.where(r2,
+                    sign_w * (abs_w - t) / (1.0 - step / gamma),
+                    w))
+            return result
+        _MCP_PROXIMAL_TORCH_COMPILED = torch.compile(_prox, dynamic=True, mode='reduce-overhead')
+    except Exception:
+        _MCP_PROXIMAL_TORCH_COMPILED = None
+    return _MCP_PROXIMAL_TORCH_COMPILED
+class MCPPenalty(Penalty):
+    """MCP penalty.
+    Parameters
+    ----------
+    alpha : float, default=1.0
+        Regularization strength.
+    gamma : float, default=3.0
+        Concavity parameter. Zhang recommends gamma > 1 (default 3.0).
+    Notes
+    -----
+    MCP is **non-convex** (``is_convex=False``). The objective function may
+    contain multiple local minima. Different solvers (e.g. ``fista`` vs
+    ``fista_bb``) can converge to different local minima with comparable
+    objective values — a coefficient ``max|diff|`` up to ~1e-2 is expected
+    and does not indicate a bug. The objective values should agree within
+    ~1e-4 relative tolerance across runs.
+    """
+    name = "mcp"
+    is_convex = False
+    def __init__(self, alpha: float = 1.0, gamma: float = 3.0):
+        if not np.isfinite(alpha) or alpha <= 0.0:
+            raise ValueError("alpha must be a finite positive scalar for MCP penalty")
+        if not np.isfinite(gamma) or gamma <= 1.0:
+            raise ValueError("gamma must be a finite scalar greater than 1 for MCP penalty")
+        self.alpha = alpha
+        self.gamma = gamma
+    # ----------------------------------------------------------------
+    # Value
+    # ----------------------------------------------------------------
+    def value(self, coef: np.ndarray) -> float:
+        xp = _xp(coef)
+        alpha = self.alpha
+        gamma = self.gamma
+        abs_w = xp.abs(coef)
+        region1 = abs_w <= gamma * alpha
+        region2 = ~region1
+        total = xp.sum(alpha * abs_w[region1] - abs_w[region1] ** 2 / (2.0 * gamma))
+        total = total + 0.5 * gamma * alpha ** 2 * xp.sum(region2)
+        return _to_float_scalar(total)
+    # ----------------------------------------------------------------
+    # Gradient
+    # ----------------------------------------------------------------
+    def gradient(self, coef):
+        xp = _xp(coef)
+        abs_w = xp.abs(coef)
+        sign_w = xp.sign(coef)
+        alpha = self.alpha
+        gamma = self.gamma
+        grad = xp.zeros_like(coef, dtype=coef.dtype if hasattr(coef, 'dtype') else float)
+        mask1 = abs_w <= gamma * alpha
+        grad[mask1] = sign_w[mask1] * (alpha - abs_w[mask1] / gamma)
+        return grad
+    # ----------------------------------------------------------------
+    # Proximal operator (FISTA direct path)
+    # ----------------------------------------------------------------
+    # Lazy-loaded fused CuPy kernel (single launch vs ~10 intermediate arrays)
+    _MCP_PROXIMAL_CUPY = None
+    def proximal(
+        self,
+        w,
+        step: float,
+        backend: str = "numpy",
+    ):
+        """Closed-form MCP proximal operator (three regions per coordinate).
+        Clamp step < gamma so the three-region formula always applies.
+        """
+        alpha = self.alpha
+        gamma = self.gamma
+        max_step = 0.9 * gamma
+        if step > max_step:
+            step = max_step
+        t = alpha * step
+        if backend == "cupy":
+            import cupy as cp
+            if MCPPenalty._MCP_PROXIMAL_CUPY is None:
+                MCPPenalty._MCP_PROXIMAL_CUPY = cp.ElementwiseKernel(
+                    'float64 w, float64 step, float64 alpha, float64 gamma',
+                    'float64 result',
+                    '''
+                    double max_step = 0.9 * gamma;
+                    double s = (step > max_step) ? max_step : step;
+                    double abs_w = abs(w);
+                    double t = alpha * s;
+                    double sign_w = (w > 0.0) ? 1.0 : ((w < 0.0) ? -1.0 : 0.0);
+                    if (abs_w <= t) {
+                        result = 0.0;
+                    } else if (abs_w <= gamma * alpha) {
+                        result = sign_w * (abs_w - t) / (1.0 - s / gamma);
+                    } else {
+                        result = w;
+                    }
+                    ''',
+                    'mcp_proximal',
+                )
+            return MCPPenalty._MCP_PROXIMAL_CUPY(w, step, alpha, gamma)
+        elif backend == "torch":
+            import torch
+            compiled_fn = _get_mcp_torch_compiled()
+            if compiled_fn is not None:
+                step_t = torch.as_tensor(step, dtype=w.dtype, device=w.device)
+                return compiled_fn(w, step_t, alpha, gamma)
+            abs_w = torch.abs(w)
+            sign_w = torch.sign(w)
+            r1 = abs_w <= t
+            r3 = abs_w > gamma * alpha
+            r2 = ~(r1 | r3)
+            result = torch.where(r1,
+                torch.zeros_like(w),
+                torch.where(r2,
+                    sign_w * (abs_w - t) / (1.0 - step / gamma),
+                    w))
+            return result
+        else:
+            abs_w = np.abs(w)
+            sign_w = np.sign(w)
+            region1 = abs_w <= t
+            region3 = abs_w > gamma * alpha
+            region2 = ~(region1 | region3)
+            result = np.zeros_like(w, dtype=float)
+            result[region2] = (
+                sign_w[region2]
+                * (abs_w[region2] - t)
+                / (1.0 - step / gamma)
+            )
+            result[region3] = w[region3]
+            return result
+    # ----------------------------------------------------------------
+    # LLA weights (Local Linear Approximation path)
+    # ----------------------------------------------------------------
+    def lla_weights(self, coef):
+        """
+        LLA weights: w_j = P'(|coef_j|) — the subgradient of MCP at |coef_j|.
+        w_j = {
+            alpha - |coef_j| / gamma   if |coef_j| <= gamma*alpha
+            0                           if |coef_j| > gamma*alpha
+        }
+        Accepts numpy, cupy, or torch arrays. Returns same backend type.
+        """
+        alpha = self.alpha
+        gamma = self.gamma
+        xp = _xp(coef)
+        abs_w = xp.abs(coef)
+        weights = xp.zeros_like(coef)
+        mask = abs_w <= gamma * alpha
+        weights[mask] = alpha - abs_w[mask] / gamma
+        return weights
+    # ----------------------------------------------------------------
+    def get_params(self) -> dict:
+        params = super().get_params()
+        params.update({"alpha": self.alpha, "gamma": self.gamma})
+        return params

statgpu/penalties/_scad.py ADDED Viewed

@@ -0,0 +1,260 @@
+"""
+SCAD penalty (Smoothly Clipped Absolute Deviation).
+Fan & Li, JASA 2001. Non-convex penalty with oracle property.
+Element-wise:
+    p(w_j) = {
+        alpha * |w_j|                                         if |w_j| <= alpha
+        -(w_j^2 - 2*a*alpha*|w_j| + alpha^2) / (2*(a-1))     if alpha < |w_j| <= a*alpha
+        (a+1)*alpha^2 / 2                                     if |w_j| > a*alpha
+    }
+Supports both FISTA direct (proximal) and LLA (lla_weights) optimization.
+"""
+__all__ = ["SCADPenalty"]
+from typing import Optional
+import numpy as np
+from statgpu.penalties._base import Penalty
+from statgpu.backends._array_ops import _xp
+from statgpu.backends._utils import _to_float_scalar
+# ---- torch.compile lazy-loader (fuses elementwise ops into 1 kernel) ---------
+_SCAD_PROXIMAL_TORCH_COMPILED = None
+def _get_scad_torch_compiled():
+    global _SCAD_PROXIMAL_TORCH_COMPILED
+    if _SCAD_PROXIMAL_TORCH_COMPILED is not None:
+        return _SCAD_PROXIMAL_TORCH_COMPILED
+    from statgpu.penalties import _torch_compile_ok
+    if not _torch_compile_ok():
+        _SCAD_PROXIMAL_TORCH_COMPILED = None
+        return None
+    try:
+        import torch
+        def _prox(w, step, alpha, a):
+            max_step = 0.9 * (a - 1.0)
+            step = torch.clamp(step, max=max_step)
+            t = alpha * step
+            abs_w = torch.abs(w)
+            sign_w = torch.sign(w)
+            r1 = abs_w <= alpha + t
+            r3 = abs_w > a * alpha
+            r2 = ~(r1 | r3)
+            result = torch.where(r1,
+                sign_w * torch.relu(abs_w - t),
+                torch.where(r2,
+                    sign_w * ((a - 1.0) * abs_w - a * t) / (a - 1.0 - step),
+                    w))
+            return result
+        _SCAD_PROXIMAL_TORCH_COMPILED = torch.compile(_prox, dynamic=True, mode='reduce-overhead')
+    except Exception:
+        _SCAD_PROXIMAL_TORCH_COMPILED = None
+    return _SCAD_PROXIMAL_TORCH_COMPILED
+class SCADPenalty(Penalty):
+    """SCAD penalty.
+    Parameters
+    ----------
+    alpha : float, default=1.0
+        Regularization strength.
+    a : float, default=3.7
+        Concavity parameter. Fan & Li recommend 3.7.
+    Notes
+    -----
+    SCAD is **non-convex** (``is_convex=False``). The objective function may
+    contain multiple local minima. Different solvers (e.g. ``fista`` vs
+    ``fista_bb``) can converge to different local minima with comparable
+    objective values — a coefficient ``max|diff|`` up to ~1e-2 is expected
+    and does not indicate a bug. The objective values should agree within
+    ~1e-4 relative tolerance across runs.
+    """
+    name = "scad"
+    is_convex = False
+    def __init__(self, alpha: float = 1.0, a: float = 3.7):
+        if not np.isfinite(alpha) or alpha <= 0.0:
+            raise ValueError("alpha must be a finite positive scalar for SCAD penalty")
+        if not np.isfinite(a) or a <= 2.0:
+            raise ValueError("a must be a finite scalar greater than 2 for SCAD penalty")
+        self.alpha = alpha
+        self.a = a
+    # ----------------------------------------------------------------
+    # Value
+    # ----------------------------------------------------------------
+    def value(self, coef: np.ndarray) -> float:
+        xp = _xp(coef)
+        a = self.a
+        alpha = self.alpha
+        abs_w = xp.abs(coef)
+        region1 = abs_w <= alpha
+        region2 = (abs_w > alpha) & (abs_w <= a * alpha)
+        region3 = abs_w > a * alpha
+        total = alpha * xp.sum(abs_w[region1])
+        total = total + xp.sum(
+            -(abs_w[region2] ** 2 - 2 * a * alpha * abs_w[region2] + alpha ** 2)
+            / (2.0 * (a - 1.0))
+        )
+        total = total + (a + 1.0) * alpha ** 2 / 2.0 * xp.sum(region3)
+        return _to_float_scalar(total)
+    # ----------------------------------------------------------------
+    # Gradient
+    # ----------------------------------------------------------------
+    def gradient(self, coef):
+        xp = _xp(coef)
+        abs_w = xp.abs(coef)
+        sign_w = xp.sign(coef)
+        a = self.a
+        alpha = self.alpha
+        grad = xp.zeros_like(coef, dtype=coef.dtype if hasattr(coef, 'dtype') else float)
+        # Region 1: |w| <= alpha → alpha * sign(w)
+        mask1 = abs_w <= alpha
+        grad[mask1] = alpha * sign_w[mask1]
+        # Region 2: alpha < |w| <= a*alpha → (a*alpha*sign - w) / (a-1)
+        mask2 = (abs_w > alpha) & (abs_w <= a * alpha)
+        grad[mask2] = (a * alpha * sign_w[mask2] - coef[mask2]) / (a - 1.0)
+        # Region 3: |w| > a*alpha → 0
+        return grad
+    # ----------------------------------------------------------------
+    # Proximal operator (FISTA direct path)
+    # ----------------------------------------------------------------
+    # Lazy-loaded fused CuPy kernel (single launch vs ~15 intermediate arrays)
+    _SCAD_PROXIMAL_CUPY = None
+    def proximal(
+        self,
+        w,
+        step: float,
+        backend: str = "numpy",
+    ):
+        """Closed-form SCAD proximal operator (three regions per coordinate).
+        When step > a-1 the three-region formula degenerates (division by
+        zero or negative denominator).  Clamp step so the three-region
+        logic always applies — this matches R ncvreg's per-coordinate
+        behaviour where each coordinate has its own step v_j and the
+        threshold is always alpha (never alpha*v_j).
+        """
+        alpha = self.alpha
+        a = self.a
+        # Clamp step: ensure a > 1 + step (three-region condition).
+        # Use 0.9*(a-1) as max to avoid the singularity at step = a-1.
+        max_step = 0.9 * (a - 1.0)
+        if step > max_step:
+            step = max_step
+        t = alpha * step
+        if backend == "cupy":
+            import cupy as cp
+            if SCADPenalty._SCAD_PROXIMAL_CUPY is None:
+                SCADPenalty._SCAD_PROXIMAL_CUPY = cp.ElementwiseKernel(
+                    'float64 w, float64 step, float64 alpha, float64 a',
+                    'float64 result',
+                    '''
+                    double max_step = 0.9 * (a - 1.0);
+                    double s = (step > max_step) ? max_step : step;
+                    double abs_w = abs(w);
+                    double t = alpha * s;
+                    double sign_w = (w > 0.0) ? 1.0 : ((w < 0.0) ? -1.0 : 0.0);
+                    if (abs_w <= alpha + t) {
+                        double v = abs_w - t;
+                        result = sign_w * (v > 0.0 ? v : 0.0);
+                    } else if (abs_w <= a * alpha) {
+                        result = sign_w * ((a - 1.0) * abs_w - a * t) / (a - 1.0 - s);
+                    } else {
+                        result = w;
+                    }
+                    ''',
+                    'scad_proximal',
+                )
+            return SCADPenalty._SCAD_PROXIMAL_CUPY(w, step, alpha, a)
+        elif backend == "torch":
+            import torch
+            compiled_fn = _get_scad_torch_compiled()
+            if compiled_fn is not None:
+                step_t = torch.as_tensor(step, dtype=w.dtype, device=w.device)
+                return compiled_fn(w, step_t, alpha, a)
+            abs_w = torch.abs(w)
+            sign_w = torch.sign(w)
+            r1 = abs_w <= alpha + t
+            r3 = abs_w > a * alpha
+            r2 = ~(r1 | r3)
+            result = torch.where(r1,
+                sign_w * torch.relu(abs_w - t),
+                torch.where(r2,
+                    sign_w * ((a - 1.0) * abs_w - a * t) / (a - 1.0 - step),
+                    w))
+            return result
+        else:
+            abs_w = np.abs(w)
+            sign_w = np.sign(w)
+            region1 = abs_w <= alpha + t
+            region3 = abs_w > a * alpha
+            region2 = ~(region1 | region3)
+            result = np.zeros_like(w, dtype=float)
+            result[region1] = sign_w[region1] * np.maximum(abs_w[region1] - t, 0.0)
+            result[region2] = (
+                sign_w[region2]
+                * ((a - 1.0) * abs_w[region2] - a * t)
+                / (a - 1.0 - step)
+            )
+            result[region3] = w[region3]
+            return result
+    # ----------------------------------------------------------------
+    # LLA weights (Local Linear Approximation path)
+    # ----------------------------------------------------------------
+    def lla_weights(self, coef):
+        """
+        LLA weights: w_j = P'(|coef_j|) — the subgradient of SCAD at |coef_j|.
+        w_j = {
+            alpha                            if |coef_j| <= alpha
+            (a*alpha - |coef_j|) / (a - 1)   if alpha < |coef_j| <= a*alpha
+            0                                 if |coef_j| > a*alpha
+        }
+        Accepts numpy, cupy, or torch arrays. Returns same backend type.
+        """
+        a = self.a
+        alpha = self.alpha
+        xp = _xp(coef)
+        abs_w = xp.abs(coef)
+        weights = xp.full_like(coef, alpha)
+        mask2 = (abs_w > alpha) & (abs_w <= a * alpha)
+        weights[mask2] = (a * alpha - abs_w[mask2]) / (a - 1.0)
+        mask3 = abs_w > a * alpha
+        weights[mask3] = 0.0
+        return weights
+    # ----------------------------------------------------------------
+    def get_params(self) -> dict:
+        params = super().get_params()
+        params.update({"alpha": self.alpha, "a": self.a})
+        return params

statgpu/semiparametric/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Semiparametric models with GPU support."""
+from ._gam import GAM
+__all__ = ['GAM']