PyPI - statgpu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

statgpu/__init__.py +174 -0
statgpu/_base.py +544 -0
statgpu/_config.py +127 -0
statgpu/anova/__init__.py +5 -0
statgpu/anova/_oneway.py +194 -0
statgpu/backends/__init__.py +83 -0
statgpu/backends/_array_ops.py +529 -0
statgpu/backends/_base.py +184 -0
statgpu/backends/_cupy.py +453 -0
statgpu/backends/_factory.py +65 -0
statgpu/backends/_gpu_inference_cupy.py +214 -0
statgpu/backends/_gpu_inference_torch.py +422 -0
statgpu/backends/_numpy.py +324 -0
statgpu/backends/_torch.py +685 -0
statgpu/backends/_torch_safe.py +47 -0
statgpu/backends/_utils.py +423 -0
statgpu/core/__init__.py +10 -0
statgpu/core/formula/__init__.py +33 -0
statgpu/core/formula/_design.py +99 -0
statgpu/core/formula/_parser.py +191 -0
statgpu/core/formula/_terms.py +70 -0
statgpu/core/formula/tests/__init__.py +0 -0
statgpu/core/formula/tests/test_parser.py +194 -0
statgpu/covariance/__init__.py +6 -0
statgpu/covariance/_empirical.py +310 -0
statgpu/covariance/_shrinkage.py +248 -0
statgpu/cross_validation/__init__.py +31 -0
statgpu/cross_validation/_base.py +410 -0
statgpu/cross_validation/_engine.py +167 -0
statgpu/diagnostics/__init__.py +7 -0
statgpu/diagnostics/_regression_diagnostics.py +188 -0
statgpu/feature_selection/__init__.py +24 -0
statgpu/feature_selection/_knockoff.py +870 -0
statgpu/feature_selection/_knockoff_utils.py +1003 -0
statgpu/feature_selection/_stepwise.py +300 -0
statgpu/glm_core/__init__.py +81 -0
statgpu/glm_core/_base.py +202 -0
statgpu/glm_core/_family.py +362 -0
statgpu/glm_core/_fused.py +149 -0
statgpu/glm_core/_gamma.py +111 -0
statgpu/glm_core/_inverse_gaussian.py +62 -0
statgpu/glm_core/_irls.py +561 -0
statgpu/glm_core/_logistic.py +82 -0
statgpu/glm_core/_negative_binomial.py +68 -0
statgpu/glm_core/_poisson.py +60 -0
statgpu/glm_core/_solver_legacy.py +100 -0
statgpu/glm_core/_squared.py +53 -0
statgpu/glm_core/_tweedie.py +74 -0
statgpu/inference/__init__.py +239 -0
statgpu/inference/_distributions_backend.py +2610 -0
statgpu/inference/_multiple_testing.py +391 -0
statgpu/inference/_resampling.py +1400 -0
statgpu/inference/_results.py +265 -0
statgpu/linear_model/__init__.py +75 -0
statgpu/linear_model/_gaussian_inference.py +306 -0
statgpu/linear_model/_glm_base.py +1261 -0
statgpu/linear_model/_ordered_logit.py +52 -0
statgpu/linear_model/_ordered_probit.py +50 -0
statgpu/linear_model/_stats.py +170 -0
statgpu/linear_model/cv/__init__.py +13 -0
statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
statgpu/linear_model/cv/_lasso_cv.py +253 -0
statgpu/linear_model/cv/_logistic_cv.py +895 -0
statgpu/linear_model/cv/_ridge_cv.py +1160 -0
statgpu/linear_model/legacy/__init__.py +1 -0
statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
statgpu/linear_model/legacy/_solver_legacy.py +104 -0
statgpu/linear_model/penalized/__init__.py +25 -0
statgpu/linear_model/penalized/_base.py +437 -0
statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
statgpu/linear_model/penalized/_penalized_linear.py +236 -0
statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
statgpu/linear_model/penalized/_predict_mixin.py +182 -0
statgpu/linear_model/wrappers/__init__.py +31 -0
statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
statgpu/linear_model/wrappers/_elasticnet.py +75 -0
statgpu/linear_model/wrappers/_gamma.py +67 -0
statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
statgpu/linear_model/wrappers/_lasso.py +2124 -0
statgpu/linear_model/wrappers/_linear.py +1127 -0
statgpu/linear_model/wrappers/_logistic.py +1435 -0
statgpu/linear_model/wrappers/_mcp.py +58 -0
statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
statgpu/linear_model/wrappers/_poisson.py +48 -0
statgpu/linear_model/wrappers/_ridge.py +166 -0
statgpu/linear_model/wrappers/_scad.py +58 -0
statgpu/linear_model/wrappers/_tweedie.py +57 -0
statgpu/metrics/__init__.py +21 -0
statgpu/metrics/_classification.py +591 -0
statgpu/nonparametric/__init__.py +50 -0
statgpu/nonparametric/kernel_methods/__init__.py +25 -0
statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
statgpu/nonparametric/kernel_methods/_krr.py +234 -0
statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
statgpu/nonparametric/splines/__init__.py +5 -0
statgpu/nonparametric/splines/_bspline_basis.py +336 -0
statgpu/nonparametric/splines/_penalized.py +349 -0
statgpu/panel/__init__.py +19 -0
statgpu/panel/_covariance.py +140 -0
statgpu/panel/_fixed_effects.py +420 -0
statgpu/panel/_random_effects.py +385 -0
statgpu/panel/_utils.py +482 -0
statgpu/penalties/__init__.py +139 -0
statgpu/penalties/_adaptive_l1.py +313 -0
statgpu/penalties/_base.py +261 -0
statgpu/penalties/_categories.py +39 -0
statgpu/penalties/_elasticnet.py +98 -0
statgpu/penalties/_group_lasso.py +678 -0
statgpu/penalties/_group_mcp.py +553 -0
statgpu/penalties/_group_scad.py +605 -0
statgpu/penalties/_l1.py +107 -0
statgpu/penalties/_l2.py +77 -0
statgpu/penalties/_mcp.py +237 -0
statgpu/penalties/_scad.py +260 -0
statgpu/semiparametric/__init__.py +5 -0
statgpu/semiparametric/_gam.py +401 -0
statgpu/solvers/__init__.py +24 -0
statgpu/solvers/_admm.py +241 -0
statgpu/solvers/_constants.py +15 -0
statgpu/solvers/_convergence.py +6 -0
statgpu/solvers/_fista.py +436 -0
statgpu/solvers/_fista_bb.py +513 -0
statgpu/solvers/_fista_lla.py +541 -0
statgpu/solvers/_lbfgs.py +206 -0
statgpu/solvers/_newton.py +149 -0
statgpu/solvers/_utils.py +277 -0
statgpu/survival/__init__.py +14 -0
statgpu/survival/_cox.py +3974 -0
statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
statgpu/survival/_cox_cv.py +1159 -0
statgpu/survival/_cox_efron_cuda.py +1280 -0
statgpu/survival/_cox_efron_triton.py +359 -0
statgpu/unsupervised/__init__.py +29 -0
statgpu/unsupervised/_agglomerative.py +307 -0
statgpu/unsupervised/_dbscan.py +263 -0
statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
statgpu/unsupervised/_gmm.py +332 -0
statgpu/unsupervised/_incremental_pca.py +176 -0
statgpu/unsupervised/_kmeans.py +261 -0
statgpu/unsupervised/_minibatch_kmeans.py +299 -0
statgpu/unsupervised/_minibatch_nmf.py +252 -0
statgpu/unsupervised/_nmf.py +190 -0
statgpu/unsupervised/_pca.py +189 -0
statgpu/unsupervised/_truncated_svd.py +132 -0
statgpu/unsupervised/_tsne.py +192 -0
statgpu/unsupervised/_umap.py +224 -0
statgpu/unsupervised/_utils.py +134 -0
statgpu-0.1.0.dist-info/METADATA +245 -0
statgpu-0.1.0.dist-info/RECORD +168 -0
statgpu-0.1.0.dist-info/WHEEL +5 -0
statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
statgpu-0.1.0.dist-info/top_level.txt +1 -0

statgpu/glm_core/_irls.py ADDED Viewed

@@ -0,0 +1,561 @@
+"""
+Unified IRLS solver for GLM.
+Extracted from the duplicated IRLS loops in _logistic.py across CPU/GPU/Torch.
+Single implementation works on numpy/cupy/torch backends via auto detection.
+"""
+import warnings
+from typing import Optional
+import numpy as np
+def _infer_backend(X):
+    """Detect backend from array type."""
+    mod = type(X).__module__
+    if mod.startswith("cupy"):
+        return "cupy"
+    if mod.startswith("torch"):
+        return "torch"
+    return "numpy"
+def _solve(A, b, backend="auto"):
+    """Solve linear system, fallback to lstsq if singular."""
+    if backend == "auto":
+        backend = _infer_backend(A)
+    try:
+        if backend == "torch":
+            import torch
+            b_col = b.unsqueeze(1) if b.ndim == 1 else b
+            sol = torch.linalg.solve(A, b_col)
+            return sol.squeeze(1) if b.ndim == 1 else sol
+        elif backend == "cupy":
+            import cupy as cp
+            return cp.linalg.solve(A, b)
+        else:
+            return np.linalg.solve(A, b)
+    except (np.linalg.LinAlgError, ValueError, RuntimeError):
+        if backend == "torch":
+            import torch
+            b_col = b.unsqueeze(1) if b.ndim == 1 else b
+            sol = torch.linalg.lstsq(A, b_col).solution
+            return sol.squeeze(1) if b.ndim == 1 else sol
+        elif backend == "cupy":
+            import cupy as cp
+            return cp.linalg.lstsq(A, b)[0]
+        return np.linalg.lstsq(A, b, rcond=None)[0]
+def _clip(x, lo, hi, backend):
+    if backend == "torch":
+        import torch
+        lo_val = lo if lo is not None else float('-inf')
+        hi_val = hi if hi is not None else float('inf')
+        return torch.clamp(x, min=lo_val, max=hi_val)
+    if backend == "cupy":
+        import cupy as cp
+        return cp.clip(x, lo, hi)
+    return np.clip(x, lo, hi)
+def _norm(x, backend):
+    if backend == "torch":
+        import torch
+        return float(torch.linalg.norm(x).item())
+    return float(np.linalg.norm(x))
+def _zeros(n, backend, ref_tensor=None, dtype=np.float64):
+    if backend == "cupy":
+        import cupy as cp
+        return cp.zeros(n, dtype=cp.float64)
+    if backend == "torch":
+        import torch
+        device = ref_tensor.device if ref_tensor is not None else "cpu"
+        return torch.zeros(n, dtype=torch.float64, device=device)
+    return np.zeros(n, dtype=dtype)
+def _diag(reg, backend, ref_tensor=None):
+    """Create diagonal matrix from 1D array."""
+    if backend == "cupy":
+        import cupy as cp
+        return cp.diag(cp.asarray(reg, dtype=cp.float64))
+    if backend == "torch":
+        import torch
+        return torch.diag(
+            torch.tensor(reg, dtype=torch.float64, device=ref_tensor.device if ref_tensor is not None else "cpu")
+        )
+    return np.diag(reg)
+def _to_backend(arr, backend, ref_tensor):
+    """Convert numpy array to the target backend."""
+    if backend == "cupy":
+        import cupy as cp
+        return cp.asarray(arr, dtype=cp.float64)
+    if backend == "torch":
+        import torch
+        return torch.tensor(arr, dtype=torch.float64, device=ref_tensor.device if ref_tensor is not None else "cpu")
+    return np.asarray(arr, dtype=float)
+def _copy_arr(arr):
+    """Copy array: .clone() for torch, .copy() for numpy/cupy."""
+    if hasattr(arr, 'clone'):
+        return arr.clone()
+    return arr.copy()
+# =============================================================================
+# Torch.compile for IRLS elementwise chain fusion
+# =============================================================================
+# When backend is torch on CUDA, the per-iteration elementwise ops
+# (link inverse, weight computation, working response, weighted matmul)
+# can be fused via torch.compile to reduce kernel launch overhead.
+# NOTE: the function that is actually compiled in this module only covers
+# the two weighted products X'WX and X'Wz.
+# Module-level cache for the step function created lazily by
+# _get_irls_step_compiled(); stays None until that function is first called.
+_IRLS_STEP_COMPILED = None
+def _torch_compile_supported():
+    """Check if torch.compile is safe (CUDA Capability >= 7.0)."""
+    try:
+        import torch
+        if torch.cuda.is_available():
+            cap = torch.cuda.get_device_capability()
+            return cap[0] >= 7
+    except Exception:
+        pass
+    return True
+def _get_irls_step_compiled():
+    """Lazily create a torch.compile'd IRLS step function."""
+    global _IRLS_STEP_COMPILED
+    if _IRLS_STEP_COMPILED is not None:
+        return _IRLS_STEP_COMPILED
+    import torch
+    def _irls_weighted_gemm(X, W, z):
+        """Weighted X'WX and X'Wz — elementwise ops fused by torch.compile."""
+        W_col = W.unsqueeze(1)
+        XtWX = X.T @ (X * W_col)
+        Xtz = X.T @ (W * z)
+        return XtWX, Xtz
+    if _torch_compile_supported():
+        try:
+            _IRLS_STEP_COMPILED = torch.compile(_irls_weighted_gemm, dynamic=True, fullgraph=False)
+        except Exception:
+            _IRLS_STEP_COMPILED = _irls_weighted_gemm
+    else:
+        _IRLS_STEP_COMPILED = _irls_weighted_gemm
+    return _IRLS_STEP_COMPILED
+def _irls_step_call(compiled_fn, *args):
+    """Call compiled IRLS step, falling back to eager on GPU arch mismatch."""
+    try:
+        return compiled_fn(*args)
+    except Exception:
+        def _irls_gemm_eager(X, W, z):
+            W_col = W.unsqueeze(1)
+            XtWX = X.T @ (X * W_col)
+            Xtz = X.T @ (W * z)
+            return XtWX, Xtz
+        return _irls_gemm_eager(*args)
+def irls_solver(
+    family,
+    X,
+    y,
+    max_iter=100,
+    tol=1e-4,
+    init_coef=None,
+    sample_weight=None,
+    ridge_alpha=0.0,
+    ridge_penalize_intercept=False,
+    backend="auto",
+    penalty_matrix=None,
+):
+    """IRLS: solve GLM by iteratively weighted least squares.
+    Parameters
+    ----------
+    family : Family
+        GLM family with link/variance/irls_* methods.
+    X : array
+        Design matrix (n_samples, n_features).
+    y : array
+        Target (n_samples,).
+    max_iter : int
+        Maximum iterations.
+    tol : float
+        Convergence tolerance on parameter change.
+    init_coef : array, optional
+        Initial coefficient vector.
+    sample_weight : array, optional
+        Sample weights.
+    ridge_alpha : float
+        L2 regularization (lambda = 1/(2*C) format).
+    ridge_penalize_intercept : bool
+        Whether to penalize the intercept.
+    backend : str
+        'numpy', 'cupy', 'torch', or 'auto'.
+    penalty_matrix : array, optional
+        Additional penalty matrix to add to the normal equations.
+        Shape must be (n_features, n_features). When provided, the
+        normal equations become: X'WX + ridge_alpha*I + penalty_matrix.
+    Returns
+    -------
+    params : array
+        Fitted parameters.
+    n_iter : int
+        Number of iterations.
+    """
+    if backend == "auto":
+        backend = _infer_backend(X)
+    if init_coef is None:
+        n_features = X.shape[1]
+        params = _zeros(n_features, backend, ref_tensor=X)
+    else:
+        params = init_coef
+    iteration = 0
+    for iteration in range(max_iter):
+        params_old = _copy_arr(params)
+        # Step 1: linear predictor (clip eta to prevent exp overflow)
+        # For identity link (squared_error), skip clipping — mu = eta = X@params
+        # and clipping distorts the OLS solution.
+        eta_raw = X @ params
+        _link_name = getattr(family.link, 'name', '')
+        if _link_name in ('identity', 'Identity'):
+            eta = eta_raw
+        else:
+            eta = _clip(eta_raw, -30, 30, backend)
+        # Step 2: inverse link -> mean (clip mu to prevent extreme weights)
+        # For identity link (squared_error), skip clipping — mu = eta.
+        mu = family.link.inverse(eta)
+        if _link_name not in ('identity', 'Identity'):
+            mu = _clip(mu, 1e-10, 1e6, backend)
+        # Step 3: IRLS weights
+        W = family.irls_weights(mu, y)
+        W = _clip(W, 1e-10, None, backend)
+        if sample_weight is not None:
+            sw = _to_backend(sample_weight, backend, X)
+            W = W * sw
+        # Step 4: working response
+        z = family.irls_working_response(mu, y, eta)
+        # Step 5: weighted least squares (X'WX + lambda*I) params = X'Wz
+        if backend == "torch":
+            import torch
+            W_col = W.unsqueeze(1)
+            _compiled_step = _get_irls_step_compiled()
+            XtWX, Xtz = _irls_step_call(_compiled_step, X, W, z)
+        else:
+            if backend == "cupy":
+                import cupy as cp
+                W_col = W[:, cp.newaxis]
+            else:
+                W_col = W[:, np.newaxis]
+            XtWX = X.T @ (X * W_col)
+            Xtz = X.T @ (W * z)
+        if ridge_alpha > 0:
+            reg = np.full(XtWX.shape[0], ridge_alpha)
+            if not ridge_penalize_intercept:
+                reg[0] = 0.0
+            XtWX = XtWX + _diag(reg, backend, ref_tensor=X)
+        # Add penalty matrix if provided (e.g., for spline smoothing)
+        if penalty_matrix is not None:
+            XtWX = XtWX + _to_backend(penalty_matrix, backend, X)
+        params_new = _solve(XtWX, Xtz, backend)
+        # Armijo backtracking line search: find step in (0, 1] that
+        # gives sufficient decrease in the loss (deviance).
+        _fname = getattr(family, 'name', '')
+        _tweedie_power = float(getattr(family, 'power', 1.5)) if _fname == "tweedie" else 0.0
+        _nb_alpha = float(getattr(family, 'alpha', 1.0)) if _fname == "negative_binomial" else 0.0
+        _y_backend = _to_backend(y, backend, X)
+        def _dev_val(mu_arr):
+            """Compute family-specific deviance (lower is better).
+            Returns device-side value (no GPU→CPU sync) for torch/cupy.
+            Correct Tweedie deviance for power p (p != 1, p != 2):
+              d(y, mu) = y*(y^(1-p) - mu^(1-p))/(1-p) - (y^(2-p) - mu^(2-p))/(2-p)
+            """
+            _y = _y_backend
+            if backend == "torch":
+                import torch
+                if _fname in ("gaussian", "squared_error"):
+                    return torch.sum((_y - mu_arr) ** 2)
+                elif _fname == "gamma":
+                    return torch.sum(_y / mu_arr - torch.log(_y / mu_arr) - 1.0)
+                elif _fname == "inverse_gaussian":
+                    return torch.sum((_y - mu_arr) ** 2 / (_y * mu_arr ** 2))
+                elif _fname == "negative_binomial":
+                    _mu_c = torch.clamp(mu_arr, min=1e-10)
+                    _y_c = torch.clamp(_y, min=1e-10)
+                    _a = _nb_alpha
+                    return torch.sum(
+                        2.0 * (_y_c * torch.log(_y_c / _mu_c)
+                               - (_y_c + 1.0 / _a) * torch.log((1.0 + _a * _y_c) / (1.0 + _a * _mu_c)))
+                    )
+                elif _fname == "tweedie":
+                    p = _tweedie_power
+                    if abs(p - 1.0) < 0.01:
+                        return torch.sum(mu_arr - _y * torch.log(mu_arr))
+                    elif abs(p - 2.0) < 0.01:
+                        return torch.sum(_y / mu_arr - torch.log(_y / mu_arr) - 1.0)
+                    else:
+                        return torch.sum(
+                            _y * (torch.pow(_y, 1.0 - p) - torch.pow(mu_arr, 1.0 - p)) / (1.0 - p)
+                            - (torch.pow(_y, 2.0 - p) - torch.pow(mu_arr, 2.0 - p)) / (2.0 - p)
+                        )
+                else:
+                    return torch.sum(mu_arr - _y * torch.log(mu_arr))
+            elif backend == "cupy":
+                import cupy as cp
+                if _fname in ("gaussian", "squared_error"):
+                    return cp.sum((_y - mu_arr) ** 2)
+                elif _fname == "gamma":
+                    return cp.sum(_y / mu_arr - cp.log(_y / mu_arr) - 1.0)
+                elif _fname == "inverse_gaussian":
+                    return cp.sum((_y - mu_arr) ** 2 / (_y * mu_arr ** 2))
+                elif _fname == "negative_binomial":
+                    _mu_c = cp.clip(mu_arr, 1e-10)
+                    _y_c = cp.clip(_y, 1e-10)
+                    _a = _nb_alpha
+                    return cp.sum(
+                        2.0 * (_y_c * cp.log(_y_c / _mu_c)
+                               - (_y_c + 1.0 / _a) * cp.log((1.0 + _a * _y_c) / (1.0 + _a * _mu_c)))
+                    )
+                elif _fname == "tweedie":
+                    p = _tweedie_power
+                    if abs(p - 1.0) < 0.01:
+                        return cp.sum(mu_arr - _y * cp.log(mu_arr))
+                    elif abs(p - 2.0) < 0.01:
+                        return cp.sum(_y / mu_arr - cp.log(_y / mu_arr) - 1.0)
+                    else:
+                        return cp.sum(
+                            _y * (cp.power(_y, 1.0 - p) - cp.power(mu_arr, 1.0 - p)) / (1.0 - p)
+                            - (cp.power(_y, 2.0 - p) - cp.power(mu_arr, 2.0 - p)) / (2.0 - p)
+                        )
+                else:
+                    return cp.sum(mu_arr - _y * cp.log(mu_arr))
+            else:
+                if _fname in ("gaussian", "squared_error"):
+                    return float(np.sum((_y - mu_arr) ** 2))
+                elif _fname == "gamma":
+                    return float(np.sum(_y / mu_arr - np.log(_y / mu_arr) - 1.0))
+                elif _fname == "inverse_gaussian":
+                    return float(np.sum((_y - mu_arr) ** 2 / (_y * mu_arr ** 2)))
+                elif _fname == "negative_binomial":
+                    _mu_c = np.clip(mu_arr, 1e-10, None)
+                    _y_c = np.clip(_y, 1e-10, None)
+                    _a = _nb_alpha
+                    return float(np.sum(
+                        2.0 * (_y_c * np.log(_y_c / _mu_c)
+                               - (_y_c + 1.0 / _a) * np.log((1.0 + _a * _y_c) / (1.0 + _a * _mu_c)))
+                    ))
+                elif _fname == "tweedie":
+                    p = _tweedie_power
+                    if abs(p - 1.0) < 0.01:
+                        return float(np.sum(mu_arr - _y * np.log(mu_arr)))
+                    elif abs(p - 2.0) < 0.01:
+                        return float(np.sum(_y / mu_arr - np.log(_y / mu_arr) - 1.0))
+                    else:
+                        return float(np.sum(
+                            _y * (np.power(_y, 1.0 - p) - np.power(mu_arr, 1.0 - p)) / (1.0 - p)
+                            - (np.power(_y, 2.0 - p) - np.power(mu_arr, 2.0 - p)) / (2.0 - p)
+                        ))
+                else:
+                    return float(np.sum(mu_arr - _y * np.log(mu_arr)))
+        # Current loss — reuse eta_raw computed at top of iteration
+        # (params have not been updated yet, so X @ params_old == eta_raw).
+        # Use eta (clipped for non-identity links) for mu computation.
+        mu_cur = family.link.inverse(eta)
+        try:
+            dev_old_dev = _dev_val(mu_cur)
+        except Exception:
+            dev_old_dev = float('inf')
+        # Line search: for families with constant IRLS weights (Gaussian,
+        # Gamma, InverseGaussian), the IRLS step IS the Newton step on the
+        # GLM loss, and the Hessian is constant X'X/n.  Accept full step.
+        # For variable-weight families (Poisson, Logistic, Tweedie),
+        # use Armijo backtracking on the deviance.
+        _direction = params_new - params_old
+        _is_constant_W = _fname in ("gamma", "gaussian", "squared_error")
+        # Convert dev_old to Python float for tolerance computation
+        # (single sync per iteration, not per line-search step)
+        if backend == "torch":
+            dev_old_f = float(dev_old_dev.item())
+        elif backend == "cupy":
+            dev_old_f = float(dev_old_dev)
+        else:
+            dev_old_f = float(dev_old_dev)
+        _dev_tol = max(abs(dev_old_f) * 1e-10, 1e-6)
+        def _dev_accept(dev_try_dev):
+            """Check if trial deviance is acceptable (device-side NaN + comparison)."""
+            if backend == "torch":
+                import torch
+                if torch.isnan(dev_try_dev):
+                    return False
+                return bool((dev_try_dev <= dev_old_dev + _dev_tol).item())
+            elif backend == "cupy":
+                import cupy as cp
+                if cp.isnan(dev_try_dev):
+                    return False
+                return bool(dev_try_dev <= dev_old_dev + _dev_tol)
+            else:
+                if dev_try_dev != dev_try_dev:
+                    return False
+                return dev_try_dev <= dev_old_f + _dev_tol
+        if _is_constant_W:
+            # Constant weights: IRLS = Newton.  Try full step first;
+            # if deviance increases significantly, fall back to Armijo.
+            eta_new = _clip(X @ params_new, -30, 30, backend)
+            mu_new = family.link.inverse(eta_new)
+            try:
+                dev_new_dev = _dev_val(mu_new)
+            except Exception:
+                dev_new_dev = float('inf')
+            if _dev_accept(dev_new_dev):
+                params = params_new
+            else:
+                step = 1.0
+                _accepted = False
+                for _bt in range(30):
+                    params_try = params_old + step * _direction
+                    eta_try = _clip(X @ params_try, -30, 30, backend)
+                    mu_try = family.link.inverse(eta_try)
+                    try:
+                        dev_try_dev = _dev_val(mu_try)
+                    except Exception:
+                        step *= 0.5
+                        continue
+                    if _dev_accept(dev_try_dev):
+                        _accepted = True
+                        break
+                    step *= 0.5
+                params = params_try if _accepted else params_old + 0.1 * _direction
+        else:
+            # Variable weights: Armijo backtracking on deviance
+            step = 1.0
+            _accepted = False
+            for _bt in range(30):
+                params_try = params_old + step * _direction
+                eta_try = _clip(X @ params_try, -30, 30, backend)
+                mu_try = family.link.inverse(eta_try)
+                try:
+                    dev_try_dev = _dev_val(mu_try)
+                except Exception:
+                    step *= 0.5
+                    continue
+                if _dev_accept(dev_try_dev):
+                    _accepted = True
+                    break
+                step *= 0.5
+            if _accepted:
+                params = params_try
+            else:
+                params = params_old + 0.1 * _direction
+        # Convergence: gradient norm check (most reliable for all families)
+        if iteration % 5 == 4 or iteration == max_iter - 1:
+            try:
+                grad_f = family.gradient(X, y, params)
+                if ridge_alpha > 0:
+                    grad_f[1:] = grad_f[1:] + (ridge_alpha / X.shape[0]) * params[1:]
+                grad_norm = float(_norm(grad_f, backend))
+            except Exception:
+                # No gradient method available — fall back to param change
+                _param_change = float(_norm(params - params_old, backend))
+                _param_norm = max(float(_norm(params, backend)), 1.0)
+                grad_norm = _param_change / _param_norm  # relative change
+            if grad_norm < tol:
+                break
+    n_iter = iteration + 1
+    if n_iter >= max_iter:
+        from statgpu.solvers._convergence import ConvergenceWarning
+        warnings.warn(
+            f"irls did not converge within {max_iter} iterations "
+            f"(family={getattr(family, 'name', '?')}).",
+            ConvergenceWarning,
+            stacklevel=2,
+        )
+    return params, n_iter
+class IRLSSolver:
+    """Unified IRLS solver: each iteration solves weighted least squares.
+    Supports numpy / cupy / torch backends (auto-detect X type).
+    """
+    def __init__(self, family, max_iter=100, tol=1e-4):
+        self.family = family
+        self.max_iter = max_iter
+        self.tol = tol
+    def fit(
+        self,
+        X,
+        y,
+        init_coef=None,
+        sample_weight=None,
+        ridge_alpha=0.0,
+        ridge_penalize_intercept=False,
+        backend="auto",
+        penalty_matrix=None,
+    ):
+        """Run IRLS loop.
+        Parameters
+        ----------
+        ridge_alpha : float
+            L2 regularization (lambda = 1/(2*C) format).
+        ridge_penalize_intercept : bool
+            Whether to penalize the intercept.
+        penalty_matrix : array, optional
+            Additional penalty matrix for the normal equations.
+        """
+        return irls_solver(
+            self.family,
+            X,
+            y,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            init_coef=init_coef,
+            sample_weight=sample_weight,
+            ridge_alpha=ridge_alpha,
+            ridge_penalize_intercept=ridge_penalize_intercept,
+            backend=backend,
+            penalty_matrix=penalty_matrix,
+        )

statgpu/glm_core/_logistic.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""
+Logistic loss: negative Bernoulli log-likelihood.
+For binary classification:
+    loss = (1/n) * sum(-y*z + log(1 + exp(z)))
+where z = X @ coef.
+Supports numpy / cupy / torch backends via _backend helpers.
+"""
+from statgpu.backends._array_ops import _clip, _log1p, _exp, _sigmoid, _sum, _max_eigval_power
+from statgpu.glm_core._base import GLMLoss, register_glm_loss
+@register_glm_loss('logistic')
+class LogisticLoss(GLMLoss):
+    name = "logistic"
+    y_type = "binary"
+    smooth_gradient = True
+    has_hessian = True
+    _lipschitz_safety = 1.5
+    _lipschitz_safety_cv = 2.0
+    _prefer_fista_over_bb = True
+    _gpu_loop_excluded = True
+    _conservative_momentum_with_nonsmooth = True
+    # ── Per-sample formulas (single source of truth) ──────────────────
+    def per_sample_value(self, eta, y):
+        """Negative Bernoulli log-likelihood per sample."""
+        from statgpu.backends._array_ops import _xp
+        xp = _xp(eta)
+        if xp.__name__ == "torch":
+            max_eta = xp.clamp(eta, min=0)
+        else:
+            max_eta = xp.maximum(eta, 0)
+        log1pexp = _log1p(_exp(-xp.abs(eta))) + max_eta
+        return -y * eta + log1pexp
+    def per_sample_gradient(self, eta, y):
+        return _sigmoid(eta) - y
+    # ── Hessian / Lipschitz (override for weighted support) ───────────
+    def hessian(self, X, y, coef, sample_weight=None):
+        z = X @ coef
+        p = _sigmoid(z)
+        W = _clip(p * (1.0 - p), 1e-10, 1.0 - 1e-10)
+        if sample_weight is not None:
+            W = W * sample_weight
+            return X.T @ (X * W[:, None]) / sample_weight.sum()
+        return X.T @ (X * W[:, None]) / X.shape[0]
+    def lipschitz(self, X, coef, y=None, sample_weight=None):
+        # Global bound: L_global = lambda_max(X'X) / (4n)
+        n_eff = float(sample_weight.sum()) if sample_weight is not None else X.shape[0]
+        if sample_weight is not None:
+            sw = sample_weight[:, None] if hasattr(sample_weight, '__len__') else sample_weight
+            XtWX = X.T @ (X * sw)
+            L_global = _max_eigval_power(XtWX) / (4.0 * n_eff)
+        else:
+            XtX = X.T @ X
+            L_global = _max_eigval_power(XtX) / (4.0 * n_eff)
+        if coef is not None:
+            z = X @ coef
+            p = _sigmoid(z)
+            W = _clip(p * (1.0 - p), 1e-10, 0.25)
+            if sample_weight is not None:
+                W = W * (sample_weight if sample_weight.ndim == 1 else sample_weight.ravel())
+            XtWX = X.T @ (X * W[:, None])
+            L_iter = _max_eigval_power(XtWX) / n_eff
+            # Floor at 10% of global bound to prevent overshoot near optimum
+            return max(L_iter, L_global * 0.1)
+        return L_global
+    def predict(self, X, coef):
+        z = X @ coef
+        p = _sigmoid(z)
+        if hasattr(p, 'numpy'):
+            return (p > 0.5).cpu().numpy()
+        elif hasattr(p, 'get'):
+            return (p > 0.5).get()
+        return p > 0.5