PyPI - statgpu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

statgpu/__init__.py +174 -0
statgpu/_base.py +544 -0
statgpu/_config.py +127 -0
statgpu/anova/__init__.py +5 -0
statgpu/anova/_oneway.py +194 -0
statgpu/backends/__init__.py +83 -0
statgpu/backends/_array_ops.py +529 -0
statgpu/backends/_base.py +184 -0
statgpu/backends/_cupy.py +453 -0
statgpu/backends/_factory.py +65 -0
statgpu/backends/_gpu_inference_cupy.py +214 -0
statgpu/backends/_gpu_inference_torch.py +422 -0
statgpu/backends/_numpy.py +324 -0
statgpu/backends/_torch.py +685 -0
statgpu/backends/_torch_safe.py +47 -0
statgpu/backends/_utils.py +423 -0
statgpu/core/__init__.py +10 -0
statgpu/core/formula/__init__.py +33 -0
statgpu/core/formula/_design.py +99 -0
statgpu/core/formula/_parser.py +191 -0
statgpu/core/formula/_terms.py +70 -0
statgpu/core/formula/tests/__init__.py +0 -0
statgpu/core/formula/tests/test_parser.py +194 -0
statgpu/covariance/__init__.py +6 -0
statgpu/covariance/_empirical.py +310 -0
statgpu/covariance/_shrinkage.py +248 -0
statgpu/cross_validation/__init__.py +31 -0
statgpu/cross_validation/_base.py +410 -0
statgpu/cross_validation/_engine.py +167 -0
statgpu/diagnostics/__init__.py +7 -0
statgpu/diagnostics/_regression_diagnostics.py +188 -0
statgpu/feature_selection/__init__.py +24 -0
statgpu/feature_selection/_knockoff.py +870 -0
statgpu/feature_selection/_knockoff_utils.py +1003 -0
statgpu/feature_selection/_stepwise.py +300 -0
statgpu/glm_core/__init__.py +81 -0
statgpu/glm_core/_base.py +202 -0
statgpu/glm_core/_family.py +362 -0
statgpu/glm_core/_fused.py +149 -0
statgpu/glm_core/_gamma.py +111 -0
statgpu/glm_core/_inverse_gaussian.py +62 -0
statgpu/glm_core/_irls.py +561 -0
statgpu/glm_core/_logistic.py +82 -0
statgpu/glm_core/_negative_binomial.py +68 -0
statgpu/glm_core/_poisson.py +60 -0
statgpu/glm_core/_solver_legacy.py +100 -0
statgpu/glm_core/_squared.py +53 -0
statgpu/glm_core/_tweedie.py +74 -0
statgpu/inference/__init__.py +239 -0
statgpu/inference/_distributions_backend.py +2610 -0
statgpu/inference/_multiple_testing.py +391 -0
statgpu/inference/_resampling.py +1400 -0
statgpu/inference/_results.py +265 -0
statgpu/linear_model/__init__.py +75 -0
statgpu/linear_model/_gaussian_inference.py +306 -0
statgpu/linear_model/_glm_base.py +1261 -0
statgpu/linear_model/_ordered_logit.py +52 -0
statgpu/linear_model/_ordered_probit.py +50 -0
statgpu/linear_model/_stats.py +170 -0
statgpu/linear_model/cv/__init__.py +13 -0
statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
statgpu/linear_model/cv/_lasso_cv.py +253 -0
statgpu/linear_model/cv/_logistic_cv.py +895 -0
statgpu/linear_model/cv/_ridge_cv.py +1160 -0
statgpu/linear_model/legacy/__init__.py +1 -0
statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
statgpu/linear_model/legacy/_solver_legacy.py +104 -0
statgpu/linear_model/penalized/__init__.py +25 -0
statgpu/linear_model/penalized/_base.py +437 -0
statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
statgpu/linear_model/penalized/_penalized_linear.py +236 -0
statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
statgpu/linear_model/penalized/_predict_mixin.py +182 -0
statgpu/linear_model/wrappers/__init__.py +31 -0
statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
statgpu/linear_model/wrappers/_elasticnet.py +75 -0
statgpu/linear_model/wrappers/_gamma.py +67 -0
statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
statgpu/linear_model/wrappers/_lasso.py +2124 -0
statgpu/linear_model/wrappers/_linear.py +1127 -0
statgpu/linear_model/wrappers/_logistic.py +1435 -0
statgpu/linear_model/wrappers/_mcp.py +58 -0
statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
statgpu/linear_model/wrappers/_poisson.py +48 -0
statgpu/linear_model/wrappers/_ridge.py +166 -0
statgpu/linear_model/wrappers/_scad.py +58 -0
statgpu/linear_model/wrappers/_tweedie.py +57 -0
statgpu/metrics/__init__.py +21 -0
statgpu/metrics/_classification.py +591 -0
statgpu/nonparametric/__init__.py +50 -0
statgpu/nonparametric/kernel_methods/__init__.py +25 -0
statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
statgpu/nonparametric/kernel_methods/_krr.py +234 -0
statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
statgpu/nonparametric/splines/__init__.py +5 -0
statgpu/nonparametric/splines/_bspline_basis.py +336 -0
statgpu/nonparametric/splines/_penalized.py +349 -0
statgpu/panel/__init__.py +19 -0
statgpu/panel/_covariance.py +140 -0
statgpu/panel/_fixed_effects.py +420 -0
statgpu/panel/_random_effects.py +385 -0
statgpu/panel/_utils.py +482 -0
statgpu/penalties/__init__.py +139 -0
statgpu/penalties/_adaptive_l1.py +313 -0
statgpu/penalties/_base.py +261 -0
statgpu/penalties/_categories.py +39 -0
statgpu/penalties/_elasticnet.py +98 -0
statgpu/penalties/_group_lasso.py +678 -0
statgpu/penalties/_group_mcp.py +553 -0
statgpu/penalties/_group_scad.py +605 -0
statgpu/penalties/_l1.py +107 -0
statgpu/penalties/_l2.py +77 -0
statgpu/penalties/_mcp.py +237 -0
statgpu/penalties/_scad.py +260 -0
statgpu/semiparametric/__init__.py +5 -0
statgpu/semiparametric/_gam.py +401 -0
statgpu/solvers/__init__.py +24 -0
statgpu/solvers/_admm.py +241 -0
statgpu/solvers/_constants.py +15 -0
statgpu/solvers/_convergence.py +6 -0
statgpu/solvers/_fista.py +436 -0
statgpu/solvers/_fista_bb.py +513 -0
statgpu/solvers/_fista_lla.py +541 -0
statgpu/solvers/_lbfgs.py +206 -0
statgpu/solvers/_newton.py +149 -0
statgpu/solvers/_utils.py +277 -0
statgpu/survival/__init__.py +14 -0
statgpu/survival/_cox.py +3974 -0
statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
statgpu/survival/_cox_cv.py +1159 -0
statgpu/survival/_cox_efron_cuda.py +1280 -0
statgpu/survival/_cox_efron_triton.py +359 -0
statgpu/unsupervised/__init__.py +29 -0
statgpu/unsupervised/_agglomerative.py +307 -0
statgpu/unsupervised/_dbscan.py +263 -0
statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
statgpu/unsupervised/_gmm.py +332 -0
statgpu/unsupervised/_incremental_pca.py +176 -0
statgpu/unsupervised/_kmeans.py +261 -0
statgpu/unsupervised/_minibatch_kmeans.py +299 -0
statgpu/unsupervised/_minibatch_nmf.py +252 -0
statgpu/unsupervised/_nmf.py +190 -0
statgpu/unsupervised/_pca.py +189 -0
statgpu/unsupervised/_truncated_svd.py +132 -0
statgpu/unsupervised/_tsne.py +192 -0
statgpu/unsupervised/_umap.py +224 -0
statgpu/unsupervised/_utils.py +134 -0
statgpu-0.1.0.dist-info/METADATA +245 -0
statgpu-0.1.0.dist-info/RECORD +168 -0
statgpu-0.1.0.dist-info/WHEEL +5 -0
statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
statgpu-0.1.0.dist-info/top_level.txt +1 -0

statgpu/survival/_cox_efron_triton.py ADDED Viewed

@@ -0,0 +1,359 @@
+"""
+Triton JIT kernel for Cox PH Efron backward gradient/Hessian.
+Mirrors the algorithm in `_cox_efron_cuda.py` (CuPy RawKernel serial version).
+Design:
+- Single Triton program (grid=(1,)) executes the entire backward scan.
+- P (feature dim) is constexpr, enabling loop unrolling for small p.
+- Local scalar accumulators where possible; workspace tensor for p*p matrices.
+"""
+from __future__ import annotations
+import os
+from typing import Any, List, Optional, Tuple
+import numpy as np
+def _import_triton():
+    """Deferred Triton import."""
+    try:
+        import triton
+        import triton.language as tl
+        return triton, tl
+    except ImportError:
+        return None, None
+_triton, _tl = _import_triton()
+HAS_TRITON_EFRON: bool = False
+HAS_TRITON_BRESLOW: bool = False
+if _triton is not None and _tl is not None:
+    try:
+        import triton
+        import triton.language as tl
+        @triton.jit
+        def _efron_backward_scan_serial(
+            # Input tensors
+            X_ptr,            # [n, p] float64
+            e_eta_ptr,        # [n] float64
+            enter_ptr_ptr,    # [nuft+1] int32
+            enter_ind_ptr,    # [n_enter_total] int32
+            exit_ptr_ptr,     # [nuft+1] int32
+            exit_ind_ptr,     # [n_exit_total] int32
+            fail_ptr_ptr,     # [nuft+1] int32
+            fail_ind_ptr,     # [n_fail_total] int32
+            # Workspace (caller-allocated, zeroed)
+            ws_ptr,           # [workspace_size] float64
+            # Output (caller-allocated, zeroed)
+            grad_ptr,         # [p] float64
+            hess_ptr,         # [p*p] float64
+            # Parameters
+            n,
+            p,
+            nuft,
+            # Compile-time constants
+            P: tl.constexpr,
+        ):
+            """Single-program serial Efron backward scan kernel."""
+            # Overview: for ii = nuft-1 .. 0 the kernel runs three phases per step:
+            #   enter - add rows listed in enter_ind[enter_ptr[ii]:enter_ptr[ii+1]]
+            #           to the running sums xp0 / xp1 / xp2 kept in the workspace;
+            #   fail  - accumulate the failing rows, then update grad and hess;
+            #   exit  - subtract the rows listed in the exit range from the sums.
+            # X is addressed as X_ptr + idx * p + j, i.e. rows of length p stored
+            # back to back. Matrices in the workspace and in hess_ptr are addressed
+            # with row stride P (j * P + k), not p.
+            # NOTE(review): the parameter `n` is not referenced in this body.
+            # Workspace layout (all offsets relative to ws_ptr):
+            WS_XP0    = 0
+            WS_XP1    = 1
+            WS_XP2    = 1 + P
+            WS_HESS   = 1 + P + P * P
+            WS_XP1F   = 1 + 2 * P * P
+            WS_XP2F   = 1 + 2 * P * P + P
+            WS_SCRATCH = 1 + 3 * P * P + P
+            WS_SIZE   = 1 + 3 * P * P + P + 1
+            # NOTE(review): WS_HESS, WS_SCRATCH and WS_SIZE are not used below.
+            # WS_XP1F starts inside the P*P range that begins at WS_HESS; since
+            # nothing is stored at WS_HESS the regions actually accessed
+            # (XP0, XP1, XP2, XP1F, XP2F) do not appear to overlap - confirm before
+            # starting to use WS_HESS.
+            # ws_ptr is already zeroed by caller.
+            # ---- Backward scan ----
+            for ii in range(nuft - 1, -1, -1):
+                # ---- Enter phase ----
+                e0 = tl.load(enter_ptr_ptr + ii)
+                e1 = tl.load(enter_ptr_ptr + ii + 1)
+                nt = e1 - e0
+                if nt > 0:
+                    for t in range(0, nt, 1):
+                        idx = tl.load(enter_ind_ptr + e0 + t)
+                        row_off = idx * p
+                        elx = tl.load(e_eta_ptr + idx)
+                        # xp0 += elx
+                        old = tl.load(ws_ptr + WS_XP0)
+                        tl.store(ws_ptr + WS_XP0, old + elx)
+                        # xp1[j] += elx * X[idx,j]
+                        for j in range(0, P, 1):
+                            # Only the first p of the P padded columns hold data.
+                            if j < p:
+                                xval = tl.load(X_ptr + row_off + j)
+                                old = tl.load(ws_ptr + WS_XP1 + j)
+                                tl.store(ws_ptr + WS_XP1 + j, old + elx * xval)
+                        # xp2[j*P+k] += elx * X[idx,j] * X[idx,k]
+                        for j in range(0, P, 1):
+                            if j < p:
+                                vj = tl.load(X_ptr + row_off + j)
+                                for k in range(0, P, 1):
+                                    if k < p:
+                                        vk = tl.load(X_ptr + row_off + k)
+                                        old = tl.load(ws_ptr + WS_XP2 + j * P + k)
+                                        tl.store(ws_ptr + WS_XP2 + j * P + k, old + elx * vj * vk)
+                # ---- Fail phase ----
+                f0 = tl.load(fail_ptr_ptr + ii)
+                f1 = tl.load(fail_ptr_ptr + ii + 1)
+                # m = number of rows in the fail range of this step.
+                m = f1 - f0
+                if m > 0:
+                    # Zero xp1f and xp2f in workspace
+                    for j in range(0, P, 1):
+                        if j < p:
+                            tl.store(ws_ptr + WS_XP1F + j, 0.0)
+                    for j in range(0, P, 1):
+                        if j < p:
+                            for k in range(0, P, 1):
+                                if k < p:
+                                    tl.store(ws_ptr + WS_XP2F + j * P + k, 0.0)
+                    # Accumulate fail sums into xp1f, xp2f, xp0f
+                    xp0f_acc = 0.0
+                    for t in range(0, m, 1):
+                        idx = tl.load(fail_ind_ptr + f0 + t)
+                        row_off = idx * p
+                        elx = tl.load(e_eta_ptr + idx)
+                        xp0f_acc = xp0f_acc + elx
+                        # grad[j] += X[idx,j]
+                        for j in range(0, P, 1):
+                            if j < p:
+                                vj = tl.load(X_ptr + row_off + j)
+                                old = tl.load(grad_ptr + j)
+                                tl.store(grad_ptr + j, old + vj)
+                        # xp1f[j] += elx * X[idx,j]
+                        for j in range(0, P, 1):
+                            if j < p:
+                                vj = tl.load(X_ptr + row_off + j)
+                                old = tl.load(ws_ptr + WS_XP1F + j)
+                                tl.store(ws_ptr + WS_XP1F + j, old + elx * vj)
+                        # xp2f[j*P+k] += elx * X[idx,j] * X[idx,k]
+                        for j in range(0, P, 1):
+                            if j < p:
+                                vj = tl.load(X_ptr + row_off + j)
+                                for k in range(0, P, 1):
+                                    if k < p:
+                                        vk = tl.load(X_ptr + row_off + k)
+                                        old = tl.load(ws_ptr + WS_XP2F + j * P + k)
+                                        tl.store(ws_ptr + WS_XP2F + j * P + k, old + elx * vj * vk)
+                    # Efron correction (serial)
+                    # For kk = 0..m-1: Jk = kk/m, c0 = xp0 - Jk * xp0f,
+                    # ak = 1/c0 and bk = Jk/c0; the sums of ak, bk, ak^2, bk^2
+                    # and ak*bk are what the grad/hess updates below consume.
+                    xp0v = tl.load(ws_ptr + WS_XP0)
+                    sum_inv_c0 = 0.0
+                    sum_J_c0 = 0.0
+                    sum_aa = 0.0
+                    sum_bb = 0.0
+                    sum_ab = 0.0
+                    for kk in range(0, m, 1):
+                        Jk = (kk * 1.0) / (m * 1.0)
+                        c0 = xp0v - Jk * xp0f_acc
+                        # Clamp the denominator so the division below stays finite.
+                        if c0 < 1e-300:
+                            c0 = 1e-300
+                        ak = 1.0 / c0
+                        bk = Jk * ak
+                        sum_inv_c0 = sum_inv_c0 + ak
+                        sum_J_c0 = sum_J_c0 + Jk / c0
+                        sum_aa = sum_aa + ak * ak
+                        sum_bb = sum_bb + bk * bk
+                        sum_ab = sum_ab + ak * bk
+                    # Apply to grad
+                    # grad[j] -= xp1[j] * sum(ak) - xp1f[j] * sum(bk)
+                    for j in range(0, P, 1):
+                        if j < p:
+                            xp1j = tl.load(ws_ptr + WS_XP1 + j)
+                            xp1fj = tl.load(ws_ptr + WS_XP1F + j)
+                            old = tl.load(grad_ptr + j)
+                            tl.store(grad_ptr + j, old - (xp1j * sum_inv_c0 - xp1fj * sum_J_c0))
+                    # Apply to hess
+                    # hess[j,k] += (xp2 * sum(ak) - xp2f * sum(bk))
+                    #              - (sum(ak^2) * xp1j*xp1k + sum(bk^2) * xp1fj*xp1fk
+                    #                 - sum(ak*bk) * (xp1j*xp1fk + xp1fj*xp1k))
+                    for j in range(0, P, 1):
+                        if j < p:
+                            for k in range(0, P, 1):
+                                if k < p:
+                                    xp2jk = tl.load(ws_ptr + WS_XP2 + j * P + k)
+                                    xp2fjk = tl.load(ws_ptr + WS_XP2F + j * P + k)
+                                    hess_val = xp2jk * sum_inv_c0 - xp2fjk * sum_J_c0
+                                    xp1j_v = tl.load(ws_ptr + WS_XP1 + j)
+                                    xp1k_v = tl.load(ws_ptr + WS_XP1 + k)
+                                    xp1fj_v = tl.load(ws_ptr + WS_XP1F + j)
+                                    xp1fk_v = tl.load(ws_ptr + WS_XP1F + k)
+                                    o11 = xp1j_v * xp1k_v
+                                    off_v = xp1fj_v * xp1fk_v
+                                    cross_v = xp1j_v * xp1fk_v + xp1fj_v * xp1k_v
+                                    hsub = sum_aa * o11 + sum_bb * off_v - sum_ab * cross_v
+                                    hess_val = hess_val - hsub
+                                    # Output matrix uses the padded row stride P.
+                                    idx2 = j * P + k
+                                    old = tl.load(hess_ptr + idx2)
+                                    tl.store(hess_ptr + idx2, hess_val + old)
+                # ---- Exit phase ----
+                x0 = tl.load(exit_ptr_ptr + ii)
+                x1 = tl.load(exit_ptr_ptr + ii + 1)
+                nx = x1 - x0
+                if nx > 0:
+                    for t in range(0, nx, 1):
+                        idx = tl.load(exit_ind_ptr + x0 + t)
+                        row_off = idx * p
+                        elx = tl.load(e_eta_ptr + idx)
+                        # xp0 -= elx
+                        old = tl.load(ws_ptr + WS_XP0)
+                        tl.store(ws_ptr + WS_XP0, old - elx)
+                        # xp1[j] -= elx * X[idx,j]
+                        for j in range(0, P, 1):
+                            if j < p:
+                                xval = tl.load(X_ptr + row_off + j)
+                                old = tl.load(ws_ptr + WS_XP1 + j)
+                                tl.store(ws_ptr + WS_XP1 + j, old - elx * xval)
+                        # xp2 -= elx * X^T X
+                        for j in range(0, P, 1):
+                            if j < p:
+                                vj = tl.load(X_ptr + row_off + j)
+                                for k in range(0, P, 1):
+                                    if k < p:
+                                        vk = tl.load(X_ptr + row_off + k)
+                                        old = tl.load(ws_ptr + WS_XP2 + j * P + k)
+                                        tl.store(ws_ptr + WS_XP2 + j * P + k, old - elx * vj * vk)
+        HAS_TRITON_EFRON = True
+    except Exception:
+        HAS_TRITON_EFRON = False
+        _triton = None
+        _tl = None
+    # =====================================================================
+    # Breslow Hessian — PyTorch GPU path (cuBLAS matmul + vectorized ops)
+    # =====================================================================
+    # Originally attempted a Triton serial-scan kernel, but Triton 2.0 has a
+    # compiler bug producing non-deterministic wrong code for kernels with
+    # runtime-bounded loops (while/for with >= 3 iterations). The PyTorch
+    # approach is only marginally slower since each op is cuBLAS-optimized.
+    try:
+        from statgpu.survival._cox_breslow_triton_kernel import (
+            compute_breslow_grad_hess_triton,
+            _find_p_ce as _find_p_ce_breslow,
+        )
+        HAS_TRITON_BRESLOW = True
+    except Exception:
+        compute_breslow_grad_hess_triton = None
+        HAS_TRITON_BRESLOW = False
+def _triton_available() -> bool:
+    return HAS_TRITON_EFRON
+_SUPPORTED_P: Tuple[int, ...] = (8, 16, 32, 64, 128)
+def _find_p_ce(p: int) -> Optional[int]:
+    for sp in _SUPPORTED_P:
+        if sp >= p:
+            return sp
+    return None
+def compute_efron_grad_hess_triton(
+    X: Any,
+    beta: Any,
+    efron_pre: Any,
+) -> Optional[Tuple[Any, Any]]:
+    """Compute Efron gradient/Hessian via Triton serial kernel."""
+    if not HAS_TRITON_EFRON:
+        return None
+    import torch
+    from statgpu.survival._cox_efron_cuda import (
+        efron_indices_to_csr,
+        _pick_backward_launch_params,
+    )
+    if len(efron_pre) == 6:
+        _, uft_ix, risk_enter, risk_exit, nuft, _ = efron_pre
+    else:
+        _, uft_ix, risk_enter, risk_exit, nuft = efron_pre
+    p = int(X.shape[1])
+    p_ce = _find_p_ce(p)
+    if p_ce is None:
+        return None
+    if nuft == 0:
+        return (
+            torch.zeros(p, dtype=torch.float64, device=X.device),
+            torch.zeros((p, p), dtype=torch.float64, device=X.device),
+        )
+    n = int(X.shape[0])
+    device = X.device
+    # Build linear predictor
+    linpred = X @ beta
+    linpred = linpred - torch.max(linpred)
+    e_eta = torch.exp(linpred)
+    # Build CSR
+    enter_ptr, enter_ind, exit_ptr, exit_ind, fail_ptr, fail_ind = efron_indices_to_csr(
+        uft_ix, risk_enter, risk_exit, nuft
+    )
+    enter_ptr_t = torch.as_tensor(enter_ptr, dtype=torch.int32, device=device)
+    enter_ind_t = torch.as_tensor(enter_ind, dtype=torch.int32, device=device)
+    exit_ptr_t = torch.as_tensor(exit_ptr, dtype=torch.int32, device=device)
+    exit_ind_t = torch.as_tensor(exit_ind, dtype=torch.int32, device=device)
+    fail_ptr_t = torch.as_tensor(fail_ptr, dtype=torch.int32, device=device)
+    fail_ind_t = torch.as_tensor(fail_ind, dtype=torch.int32, device=device)
+    seq_thresh, _ = _pick_backward_launch_params(p, nuft, n)
+    # Workspace: WS_XP0(1) + WS_XP1(P) + WS_XP2(P*P) + WS_HESS(P*P) +
+    #             WS_XP1F(P) + WS_XP2F(P*P) + WS_SCRATCH(1)
+    ws_size = 1 + 3 * p_ce + 3 * p_ce * p_ce + 1
+    ws = torch.zeros(ws_size, dtype=torch.float64, device=device)
+    grad_out = torch.zeros(p, dtype=torch.float64, device=device)
+    # Allocate hess_out with padded stride (p_ce) to match Triton kernel indexing
+    hess_out = torch.zeros(p_ce * p_ce, dtype=torch.float64, device=device)
+    try:
+        _efron_backward_scan_serial[(1,)](
+            X, e_eta,
+            enter_ptr_t, enter_ind_t,
+            exit_ptr_t, exit_ind_t,
+            fail_ptr_t, fail_ind_t,
+            ws, grad_out, hess_out,
+            n, p, nuft,
+            P=p_ce,
+        )
+        torch.cuda.synchronize()
+    except Exception:
+        return None
+    return grad_out, -hess_out.view(p_ce, p_ce)[:p, :p]
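+# compute_breslow_grad_hess_triton is imported from _cox_breslow_triton_kernel.py
+# above (in the try/except block). That module's _find_p_ce is imported under the
+# alias _find_p_ce_breslow; the _find_p_ce defined in this file is the Efron helper.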

statgpu/unsupervised/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""Unsupervised learning estimators."""
+from ._pca import PCA
+from ._kmeans import KMeans
+from ._dbscan import DBSCAN
+from ._gmm import GaussianMixture
+from ._nmf import NMF
+from ._agglomerative import AgglomerativeClustering
+from ._truncated_svd import TruncatedSVD
+from ._minibatch_kmeans import MiniBatchKMeans
+from ._incremental_pca import IncrementalPCA
+from ._minibatch_nmf import MiniBatchNMF
+from ._umap import UMAP
+from ._tsne import TSNE
+__all__ = [
+    "PCA",
+    "KMeans",
+    "DBSCAN",
+    "GaussianMixture",
+    "NMF",
+    "AgglomerativeClustering",
+    "TruncatedSVD",
+    "MiniBatchKMeans",
+    "IncrementalPCA",
+    "MiniBatchNMF",
+    "UMAP",
+    "TSNE",
+]

statgpu/unsupervised/_agglomerative.py ADDED Viewed

@@ -0,0 +1,307 @@
+"""Agglomerative clustering."""
+from __future__ import annotations
+import os
+import warnings
+from typing import Optional, Union
+import numpy as np
+from scipy.cluster.hierarchy import fcluster, linkage
+from statgpu._base import BaseEstimator
+from statgpu._config import Device
+from statgpu.unsupervised._utils import check_2d_array, reject_sparse, squared_euclidean_distances
+DEFAULT_GPU_DISTANCE_LIMIT_BYTES = 1 << 30
+def _gpu_distance_limit_bytes() -> int:
+    value = os.environ.get("STATGPU_AGGLOMERATIVE_GPU_MAX_BYTES")
+    if value is None:
+        return DEFAULT_GPU_DISTANCE_LIMIT_BYTES
+    try:
+        return int(value)
+    except ValueError:
+        warnings.warn(
+            "Invalid STATGPU_AGGLOMERATIVE_GPU_MAX_BYTES value; "
+            f"using default {DEFAULT_GPU_DISTANCE_LIMIT_BYTES} bytes.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+        return DEFAULT_GPU_DISTANCE_LIMIT_BYTES
+class AgglomerativeClustering(BaseEstimator):
+    """Exact dense agglomerative clustering."""
+    _GPU_DISTANCE_LIMIT_BYTES = _gpu_distance_limit_bytes()
+    def __init__(
+        self,
+        n_clusters: int = 2,
+        linkage: str = "single",
+        metric: str = "euclidean",
+        device: Union[str, Device] = Device.AUTO,
+        n_jobs: Optional[int] = None,
+    ):
+        super().__init__(device=device, n_jobs=n_jobs)
+        self.n_clusters = n_clusters
+        self.linkage = linkage
+        self.metric = metric
+    def _validate_params(self, n_samples: int):
+        if not isinstance(self.n_clusters, (int, np.integer)) or int(self.n_clusters) < 1:
+            raise ValueError("n_clusters must be a positive integer")
+        if int(self.n_clusters) > n_samples:
+            raise ValueError("n_clusters must be less than or equal to n_samples")
+        if self.linkage not in ("single", "complete", "average", "ward"):
+            raise ValueError("linkage must be one of: 'single', 'complete', 'average', 'ward'")
+        if self.metric != "euclidean":
+            raise NotImplementedError("AgglomerativeClustering only supports metric='euclidean'")
+    def _use_gpu_path(self) -> bool:
+        return self.device in (Device.CUDA, Device.TORCH)
+    def _check_gpu_memory(self, n_samples: int):
+        required = int(n_samples) * int(n_samples) * 8
+        if required > self._GPU_DISTANCE_LIMIT_BYTES:
+            limit_mb = self._GPU_DISTANCE_LIMIT_BYTES / (1024**2)
+            required_mb = required / (1024**2)
+            raise MemoryError(
+                "AgglomerativeClustering GPU exact path requires a dense "
+                f"distance matrix of about {required_mb:.1f} MiB, exceeding "
+                f"the configured limit {limit_mb:.1f} MiB. Use device='cpu' "
+                "or raise STATGPU_AGGLOMERATIVE_GPU_MAX_BYTES explicitly."
+            )
+    @staticmethod
+    def _labels_from_children(n_samples: int, n_clusters: int, children: np.ndarray) -> np.ndarray:
+        clusters = {i: [i] for i in range(n_samples)}
+        next_id = n_samples
+        merges_to_apply = max(0, n_samples - int(n_clusters))
+        for left, right in children[:merges_to_apply]:
+            members = clusters.pop(int(left)) + clusters.pop(int(right))
+            clusters[next_id] = members
+            next_id += 1
+        labels = np.empty(n_samples, dtype=np.int64)
+        for label, members in enumerate(clusters.values()):
+            labels[np.asarray(members, dtype=np.int64)] = label
+        return labels
+    @staticmethod
+    def _single_linkage_from_mst(
+        n_samples: int,
+        edge_parents: np.ndarray,
+        edge_children: np.ndarray,
+        edge_weights: np.ndarray,
+    ):
+        order = np.argsort(edge_weights, kind="mergesort")
+        uf_parent = list(range(n_samples))
+        cluster_ids = list(range(n_samples))
+        children = np.empty((n_samples - 1, 2), dtype=np.int64)
+        distances = np.empty(n_samples - 1, dtype=np.float64)
+        def find(idx: int) -> int:
+            while uf_parent[idx] != idx:
+                uf_parent[idx] = uf_parent[uf_parent[idx]]
+                idx = uf_parent[idx]
+            return idx
+        merge_step = 0
+        for edge_idx in order:
+            left_root = find(int(edge_parents[edge_idx]))
+            right_root = find(int(edge_children[edge_idx]))
+            if left_root == right_root:
+                continue
+            children[merge_step] = (cluster_ids[left_root], cluster_ids[right_root])
+            distances[merge_step] = float(edge_weights[edge_idx])
+            uf_parent[right_root] = left_root
+            cluster_ids[left_root] = n_samples + merge_step
+            merge_step += 1
+            if merge_step == n_samples - 1:
+                break
+        return children, distances
+    def _fit_gpu_single(self, backend, X_arr, n_samples: int):
+        """Single-linkage merge table computed from a spanning tree.
+        Builds the dense Euclidean distance matrix on ``backend``, grows a
+        spanning tree from sample 0 by repeatedly attaching the closest
+        not-yet-selected sample (Prim-style), and passes the resulting edge list
+        to ``_single_linkage_from_mst``.
+        Returns:
+            The ``(children, distances)`` pair produced by
+            ``_single_linkage_from_mst``.
+        """
+        D = backend.sqrt(squared_euclidean_distances(backend, X_arr))
+        inf = float("inf")
+        indices = backend.arange(n_samples, dtype=backend.int64)
+        # Mask the diagonal so a sample is never its own nearest neighbour.
+        D[indices, indices] = inf
+        selected = backend.zeros(n_samples, dtype=backend.bool)
+        selected[0] = True
+        # min_dist[i]: smallest known distance from sample i to the tree;
+        # entries of selected samples are kept at inf so argmin skips them.
+        min_dist = backend.copy(D[0, :])
+        min_dist[0] = inf
+        # nearest_parent[i]: tree sample that realises min_dist[i] (starts at 0).
+        nearest_parent = backend.zeros(n_samples, dtype=backend.int64)
+        # Edge list is collected in host (NumPy) arrays.
+        edge_parents = np.empty(n_samples - 1, dtype=np.int64)
+        edge_children = np.empty(n_samples - 1, dtype=np.int64)
+        edge_weights = np.empty(n_samples - 1, dtype=np.float64)
+        for step in range(n_samples - 1):
+            # presumably float() is what pulls the backend scalar to the host
+            # before the int() conversion - TODO confirm against the backends.
+            child = int(float(backend.argmin(min_dist)))
+            edge_children[step] = child
+            edge_parents[step] = int(float(nearest_parent[child]))
+            edge_weights[step] = float(min_dist[child])
+            selected[child] = True
+            candidate = D[child, :]
+            # Relax: unselected samples that are closer to the new tree member.
+            update_mask = (candidate < min_dist) & (~selected)
+            nearest_parent[update_mask] = child
+            min_dist = backend.where(update_mask, candidate, min_dist)
+            min_dist[child] = inf
+        return self._single_linkage_from_mst(n_samples, edge_parents, edge_children, edge_weights)
+    def _fit_gpu(self, X):
+        """Fit using the estimator's array backend and a dense distance matrix.
+        Single linkage on the ``cupy``/``torch`` backends is delegated to
+        ``_fit_gpu_single``. Every other case runs ``n_samples - 1`` merge steps
+        directly on the distance matrix ``D``: find the global minimum, record
+        the merge, overwrite row/column ``a`` with the merged cluster's
+        distances and retire row/column ``b`` by setting it to inf.
+        Sets ``labels_``, ``children_``, ``distances_``, ``n_features_in_``,
+        ``_backend_name`` and ``_fitted``; returns ``self``.
+        Raises:
+            MemoryError: via ``_check_gpu_memory`` when the distance matrix
+                would exceed the configured byte limit.
+        """
+        backend = self._get_backend()
+        X_arr = self._to_array(X, backend=backend.name)
+        X_arr = backend.asarray(X_arr, dtype=backend.float64)
+        check_2d_array(X_arr)
+        n_samples, n_features = X_arr.shape
+        self._validate_params(n_samples)
+        self._check_gpu_memory(n_samples)
+        # A single sample has no merges: empty tree, one cluster.
+        if n_samples == 1:
+            self.labels_ = np.zeros(1, dtype=np.int64)
+            self.children_ = np.empty((0, 2), dtype=np.int64)
+            self.distances_ = np.empty((0,), dtype=np.float64)
+            self.n_features_in_ = int(n_features)
+            self._backend_name = backend.name
+            self._fitted = True
+            return self
+        if self.linkage == "single" and backend.name in ("cupy", "torch"):
+            children, distances = self._fit_gpu_single(backend, X_arr, n_samples)
+            self.children_ = children
+            self.distances_ = distances
+            self.labels_ = self._labels_from_children(n_samples, int(self.n_clusters), children)
+            self.n_features_in_ = int(n_features)
+            self._backend_name = backend.name
+            self._fitted = True
+            return self
+        # Ward keeps D as squared distances; the other linkages use plain
+        # Euclidean distances.
+        D = squared_euclidean_distances(backend, X_arr)
+        if self.linkage != "ward":
+            D = backend.sqrt(D)
+        inf = float("inf")
+        indices = backend.arange(n_samples, dtype=backend.int64)
+        # inf on the diagonal (and later on retired rows/columns) keeps those
+        # entries out of the argmin below.
+        D[indices, indices] = inf
+        children = np.empty((n_samples - 1, 2), dtype=np.int64)
+        distances = np.empty(n_samples - 1, dtype=np.float64)
+        # cluster_ids[i]: merge-table id of the cluster currently stored in slot i.
+        cluster_ids = list(range(n_samples))
+        cluster_sizes = [1.0] * n_samples
+        # Only Ward needs the sizes as a backend array (used in its update).
+        cluster_sizes_backend = (
+            backend.asarray(cluster_sizes, dtype=backend.float64) if self.linkage == "ward" else None
+        )
+        for step in range(n_samples - 1):
+            # argmin over the whole matrix returns a flat index; split it into
+            # (row, column) and order the pair so that a < b.
+            flat_idx = int(float(backend.argmin(D)))
+            a = flat_idx // n_samples
+            b = flat_idx % n_samples
+            if b < a:
+                a, b = b, a
+            merge_value = float(D[a, b])
+            children[step] = (cluster_ids[a], cluster_ids[b])
+            # For Ward, D holds squared values, so report the square root.
+            distances[step] = np.sqrt(max(merge_value, 0.0)) if self.linkage == "ward" else merge_value
+            da = D[a, :]
+            db = D[b, :]
+            size_a = cluster_sizes[a]
+            size_b = cluster_sizes[b]
+            if self.linkage == "single":
+                updated = backend.minimum(da, db)
+            elif self.linkage == "complete":
+                if backend.name in ("cupy", "torch"):
+                    # In-place on da; NOTE(review): da looks like a view of
+                    # row a of D on these backends - confirm.
+                    backend.xp.maximum(da, db, out=da)
+                    updated = da
+                else:
+                    updated = backend.maximum(da, db)
+            elif self.linkage == "average":
+                # Size-weighted mean of the two rows.
+                if backend.name in ("cupy", "torch"):
+                    da *= size_a
+                    da += size_b * db
+                    da /= size_a + size_b
+                    updated = da
+                else:
+                    updated = (size_a * da + size_b * db) / (size_a + size_b)
+            else:
+                # Ward: size-weighted recurrence on squared distances
+                # (appears to be the Lance-Williams form - TODO confirm).
+                total = size_a + size_b + cluster_sizes_backend
+                updated = (
+                    ((cluster_sizes_backend + size_a) / total) * da
+                    + ((cluster_sizes_backend + size_b) / total) * db
+                    - (cluster_sizes_backend / total) * merge_value
+                )
+                # Clip negatives before the values are reused as squared distances.
+                updated = backend.maximum(updated, 0.0)
+            # Slot a now represents the merged cluster.
+            D[a, :] = updated
+            D[:, a] = updated
+            cluster_ids[a] = n_samples + step
+            cluster_sizes[a] += cluster_sizes[b]
+            cluster_sizes[b] = 0.0
+            if cluster_sizes_backend is not None:
+                cluster_sizes_backend[a] = cluster_sizes[a]
+                cluster_sizes_backend[b] = 0.0
+            # Retire slot b and restore the inf diagonal entry of slot a.
+            D[b, :] = inf
+            D[:, b] = inf
+            D[a, a] = inf
+        self.children_ = children
+        self.distances_ = distances
+        self.labels_ = self._labels_from_children(n_samples, int(self.n_clusters), children)
+        self.n_features_in_ = int(n_features)
+        self._backend_name = backend.name
+        self._fitted = True
+        return self
+    def fit(self, X, y=None):
+        reject_sparse(X, "AgglomerativeClustering")
+        if self._use_gpu_path():
+            return self._fit_gpu(X)
+        X_arr = np.asarray(X, dtype=np.float64)
+        check_2d_array(X_arr)
+        n_samples, n_features = X_arr.shape
+        self._validate_params(n_samples)
+        if n_samples == 1:
+            children = np.empty((0, 2), dtype=np.int64)
+            distances = np.empty((0,), dtype=np.float64)
+            labels = np.zeros(1, dtype=np.int64)
+        else:
+            Z = linkage(X_arr, method=self.linkage, metric="euclidean")
+            children = Z[:, :2].astype(np.int64, copy=False)
+            distances = Z[:, 2].astype(np.float64, copy=False)
+            labels = fcluster(Z, t=int(self.n_clusters), criterion="maxclust").astype(np.int64) - 1
+        self.labels_ = labels
+        self.children_ = children
+        self.distances_ = distances
+        self.n_features_in_ = int(n_features)
+        self._backend_name = "numpy"
+        self._fitted = True
+        return self
+    def fit_predict(self, X, y=None):
+        return self.fit(X, y=y).labels_
+    def predict(self, X):
+        raise NotImplementedError("AgglomerativeClustering does not support predict for unseen samples")
+    def get_params(self, deep=True):
+        params = super().get_params(deep=deep)
+        params.update(
+            {
+                "n_clusters": self.n_clusters,
+                "linkage": self.linkage,
+                "metric": self.metric,
+            }
+        )
+        return params