PyPI - statgpu - Versions diffs - 0.1.0__py3-none-any.whl - Mend

statgpu 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (168) hide show

statgpu/__init__.py +174 -0
statgpu/_base.py +544 -0
statgpu/_config.py +127 -0
statgpu/anova/__init__.py +5 -0
statgpu/anova/_oneway.py +194 -0
statgpu/backends/__init__.py +83 -0
statgpu/backends/_array_ops.py +529 -0
statgpu/backends/_base.py +184 -0
statgpu/backends/_cupy.py +453 -0
statgpu/backends/_factory.py +65 -0
statgpu/backends/_gpu_inference_cupy.py +214 -0
statgpu/backends/_gpu_inference_torch.py +422 -0
statgpu/backends/_numpy.py +324 -0
statgpu/backends/_torch.py +685 -0
statgpu/backends/_torch_safe.py +47 -0
statgpu/backends/_utils.py +423 -0
statgpu/core/__init__.py +10 -0
statgpu/core/formula/__init__.py +33 -0
statgpu/core/formula/_design.py +99 -0
statgpu/core/formula/_parser.py +191 -0
statgpu/core/formula/_terms.py +70 -0
statgpu/core/formula/tests/__init__.py +0 -0
statgpu/core/formula/tests/test_parser.py +194 -0
statgpu/covariance/__init__.py +6 -0
statgpu/covariance/_empirical.py +310 -0
statgpu/covariance/_shrinkage.py +248 -0
statgpu/cross_validation/__init__.py +31 -0
statgpu/cross_validation/_base.py +410 -0
statgpu/cross_validation/_engine.py +167 -0
statgpu/diagnostics/__init__.py +7 -0
statgpu/diagnostics/_regression_diagnostics.py +188 -0
statgpu/feature_selection/__init__.py +24 -0
statgpu/feature_selection/_knockoff.py +870 -0
statgpu/feature_selection/_knockoff_utils.py +1003 -0
statgpu/feature_selection/_stepwise.py +300 -0
statgpu/glm_core/__init__.py +81 -0
statgpu/glm_core/_base.py +202 -0
statgpu/glm_core/_family.py +362 -0
statgpu/glm_core/_fused.py +149 -0
statgpu/glm_core/_gamma.py +111 -0
statgpu/glm_core/_inverse_gaussian.py +62 -0
statgpu/glm_core/_irls.py +561 -0
statgpu/glm_core/_logistic.py +82 -0
statgpu/glm_core/_negative_binomial.py +68 -0
statgpu/glm_core/_poisson.py +60 -0
statgpu/glm_core/_solver_legacy.py +100 -0
statgpu/glm_core/_squared.py +53 -0
statgpu/glm_core/_tweedie.py +74 -0
statgpu/inference/__init__.py +239 -0
statgpu/inference/_distributions_backend.py +2610 -0
statgpu/inference/_multiple_testing.py +391 -0
statgpu/inference/_resampling.py +1400 -0
statgpu/inference/_results.py +265 -0
statgpu/linear_model/__init__.py +75 -0
statgpu/linear_model/_gaussian_inference.py +306 -0
statgpu/linear_model/_glm_base.py +1261 -0
statgpu/linear_model/_ordered_logit.py +52 -0
statgpu/linear_model/_ordered_probit.py +50 -0
statgpu/linear_model/_stats.py +170 -0
statgpu/linear_model/cv/__init__.py +13 -0
statgpu/linear_model/cv/_elasticnet_cv.py +892 -0
statgpu/linear_model/cv/_lasso_cv.py +253 -0
statgpu/linear_model/cv/_logistic_cv.py +895 -0
statgpu/linear_model/cv/_ridge_cv.py +1160 -0
statgpu/linear_model/legacy/__init__.py +1 -0
statgpu/linear_model/legacy/_distributions_legacy_gpu.py +340 -0
statgpu/linear_model/legacy/_elasticnet_legacy.py +936 -0
statgpu/linear_model/legacy/_lasso_legacy.py +4876 -0
statgpu/linear_model/legacy/_penalized_legacy.py +1174 -0
statgpu/linear_model/legacy/_ridge_legacy.py +863 -0
statgpu/linear_model/legacy/_solver_legacy.py +104 -0
statgpu/linear_model/penalized/__init__.py +25 -0
statgpu/linear_model/penalized/_base.py +437 -0
statgpu/linear_model/penalized/_fit_mixin.py +1877 -0
statgpu/linear_model/penalized/_inference_mixin.py +1179 -0
statgpu/linear_model/penalized/_penalized_cv.py +2699 -0
statgpu/linear_model/penalized/_penalized_gamma.py +86 -0
statgpu/linear_model/penalized/_penalized_inverse_gaussian.py +62 -0
statgpu/linear_model/penalized/_penalized_linear.py +236 -0
statgpu/linear_model/penalized/_penalized_logistic.py +100 -0
statgpu/linear_model/penalized/_penalized_negative_binomial.py +65 -0
statgpu/linear_model/penalized/_penalized_poisson.py +62 -0
statgpu/linear_model/penalized/_penalized_tweedie.py +65 -0
statgpu/linear_model/penalized/_predict_mixin.py +182 -0
statgpu/linear_model/wrappers/__init__.py +31 -0
statgpu/linear_model/wrappers/_adaptive_lasso.py +63 -0
statgpu/linear_model/wrappers/_elasticnet.py +75 -0
statgpu/linear_model/wrappers/_gamma.py +67 -0
statgpu/linear_model/wrappers/_inverse_gaussian.py +47 -0
statgpu/linear_model/wrappers/_lasso.py +2124 -0
statgpu/linear_model/wrappers/_linear.py +1127 -0
statgpu/linear_model/wrappers/_logistic.py +1435 -0
statgpu/linear_model/wrappers/_mcp.py +58 -0
statgpu/linear_model/wrappers/_negative_binomial.py +58 -0
statgpu/linear_model/wrappers/_poisson.py +48 -0
statgpu/linear_model/wrappers/_ridge.py +166 -0
statgpu/linear_model/wrappers/_scad.py +58 -0
statgpu/linear_model/wrappers/_tweedie.py +57 -0
statgpu/metrics/__init__.py +21 -0
statgpu/metrics/_classification.py +591 -0
statgpu/nonparametric/__init__.py +50 -0
statgpu/nonparametric/kernel_methods/__init__.py +25 -0
statgpu/nonparametric/kernel_methods/_kernels.py +246 -0
statgpu/nonparametric/kernel_methods/_krr.py +234 -0
statgpu/nonparametric/kernel_methods/_krr_cv.py +380 -0
statgpu/nonparametric/kernel_smoothing/__init__.py +39 -0
statgpu/nonparametric/kernel_smoothing/_bandwidth_selection.py +1083 -0
statgpu/nonparametric/kernel_smoothing/_kde.py +761 -0
statgpu/nonparametric/kernel_smoothing/_kernel_common.py +348 -0
statgpu/nonparametric/kernel_smoothing/_kernel_regression.py +748 -0
statgpu/nonparametric/splines/__init__.py +5 -0
statgpu/nonparametric/splines/_bspline_basis.py +336 -0
statgpu/nonparametric/splines/_penalized.py +349 -0
statgpu/panel/__init__.py +19 -0
statgpu/panel/_covariance.py +140 -0
statgpu/panel/_fixed_effects.py +420 -0
statgpu/panel/_random_effects.py +385 -0
statgpu/panel/_utils.py +482 -0
statgpu/penalties/__init__.py +139 -0
statgpu/penalties/_adaptive_l1.py +313 -0
statgpu/penalties/_base.py +261 -0
statgpu/penalties/_categories.py +39 -0
statgpu/penalties/_elasticnet.py +98 -0
statgpu/penalties/_group_lasso.py +678 -0
statgpu/penalties/_group_mcp.py +553 -0
statgpu/penalties/_group_scad.py +605 -0
statgpu/penalties/_l1.py +107 -0
statgpu/penalties/_l2.py +77 -0
statgpu/penalties/_mcp.py +237 -0
statgpu/penalties/_scad.py +260 -0
statgpu/semiparametric/__init__.py +5 -0
statgpu/semiparametric/_gam.py +401 -0
statgpu/solvers/__init__.py +24 -0
statgpu/solvers/_admm.py +241 -0
statgpu/solvers/_constants.py +15 -0
statgpu/solvers/_convergence.py +6 -0
statgpu/solvers/_fista.py +436 -0
statgpu/solvers/_fista_bb.py +513 -0
statgpu/solvers/_fista_lla.py +541 -0
statgpu/solvers/_lbfgs.py +206 -0
statgpu/solvers/_newton.py +149 -0
statgpu/solvers/_utils.py +277 -0
statgpu/survival/__init__.py +14 -0
statgpu/survival/_cox.py +3974 -0
statgpu/survival/_cox_breslow_triton_kernel.py +106 -0
statgpu/survival/_cox_cv.py +1159 -0
statgpu/survival/_cox_efron_cuda.py +1280 -0
statgpu/survival/_cox_efron_triton.py +359 -0
statgpu/unsupervised/__init__.py +29 -0
statgpu/unsupervised/_agglomerative.py +307 -0
statgpu/unsupervised/_dbscan.py +263 -0
statgpu/unsupervised/_dbscan_cpu.pyx +125 -0
statgpu/unsupervised/_gmm.py +332 -0
statgpu/unsupervised/_incremental_pca.py +176 -0
statgpu/unsupervised/_kmeans.py +261 -0
statgpu/unsupervised/_minibatch_kmeans.py +299 -0
statgpu/unsupervised/_minibatch_nmf.py +252 -0
statgpu/unsupervised/_nmf.py +190 -0
statgpu/unsupervised/_pca.py +189 -0
statgpu/unsupervised/_truncated_svd.py +132 -0
statgpu/unsupervised/_tsne.py +192 -0
statgpu/unsupervised/_umap.py +224 -0
statgpu/unsupervised/_utils.py +134 -0
statgpu-0.1.0.dist-info/METADATA +245 -0
statgpu-0.1.0.dist-info/RECORD +168 -0
statgpu-0.1.0.dist-info/WHEEL +5 -0
statgpu-0.1.0.dist-info/licenses/LICENSE +199 -0
statgpu-0.1.0.dist-info/top_level.txt +1 -0

statgpu/backends/_cupy.py ADDED Viewed

@@ -0,0 +1,453 @@
+"""
+CuPy GPU backend.
+"""
+import numpy as np
+from statgpu.backends._base import BackendBase
+from statgpu.backends._utils import _torch_to_cupy_dlpack
+class CuPyBackend(BackendBase):
+    """
+    GPU backend powered by CuPy.
+    Requires ``cupy`` (install via ``pip install statgpu[gpu11]`` for CUDA 11
+    or ``pip install statgpu[gpu12]`` for CUDA 12).
+    """
+    name = "cupy"
+    @property
+    def xp(self):
+        import cupy as cp  # deferred so import doesn't fail without cupy
+        return cp
+    def asarray(self, x, dtype=None):
+        import cupy as cp
+        if hasattr(x, "cpu"):
+            arr = _torch_to_cupy_dlpack(x)
+            if arr is not None:
+                return arr.astype(dtype, copy=False) if dtype is not None else arr
+            # PyTorch tensors expose a .cpu() method that moves the tensor to
+            # CPU memory before converting to NumPy.  Duck-typing avoids a
+            # mandatory torch import.
+            x = x.detach().cpu().numpy()
+        return cp.asarray(x, dtype=dtype)
+    def to_numpy(self, x) -> np.ndarray:
+        import cupy as cp
+        if isinstance(x, cp.ndarray):
+            return cp.asnumpy(x)
+        # Fallback for numpy or other array-likes
+        if hasattr(x, "get"):
+            return x.get()
+        return np.asarray(x)
+    def is_available(self) -> bool:
+        try:
+            import cupy as cp
+            cp.cuda.Device(0).use()
+            return True
+        except Exception:
+            return False
+    def lstsq(self, A, b, rcond=None):
+        import cupy as cp
+        # CuPy's lstsq signature matches NumPy's
+        return cp.linalg.lstsq(A, b, rcond=rcond)
+    def solve_triangular(self, A, b, lower=False, trans=False, unit_triangular=False):
+        """
+        Solve the triangular system Ax = b.
+        Parameters
+        ----------
+        A : cupy.ndarray
+            Triangular matrix (n, n).
+        b : cupy.ndarray
+            Right-hand side (n,) or (n, k).
+        lower : bool, default=False
+            Whether to use the lower triangle of A.
+        trans : bool, default=False
+            Whether to transpose A.
+        unit_triangular : bool, default=False
+            Whether to assume the diagonal of A is all ones.
+        Returns
+        -------
+        x : cupy.ndarray
+            Solution to the system.
+        """
+        import cupy as cp
+        # Use cupyx.scipy.linalg.solve_triangular for proper triangular solve
+        # This is much faster than generic solve for triangular systems
+        try:
+            from cupyx.scipy.linalg import solve_triangular
+            return solve_triangular(A, b, lower=lower, trans=trans, unit_diagonal=unit_triangular)
+        except ImportError:
+            # Fallback to generic solve if cupyx.scipy not available
+            return cp.linalg.solve(A, b)
+    # ------------------------------------------------------------------
+    # Helper methods for array operations
+    # ------------------------------------------------------------------
+    def sum(self, x, axis=None, keepdims=False):
+        """Sum over specified axis/axes."""
+        import cupy as cp
+        return cp.sum(x, axis=axis, keepdims=keepdims)
+    def mean(self, x, axis=None, keepdims=False):
+        """Mean over specified axis/axes."""
+        import cupy as cp
+        return cp.mean(x, axis=axis, keepdims=keepdims)
+    def sqrt(self, x):
+        """Element-wise square root."""
+        import cupy as cp
+        return cp.sqrt(x)
+    def abs(self, x):
+        """Element-wise absolute value."""
+        import cupy as cp
+        return cp.abs(x)
+    def max(self, x, axis=None, keepdims=False):
+        """Maximum value along axis."""
+        import cupy as cp
+        return cp.max(x, axis=axis, keepdims=keepdims)
+    def outer(self, a, b):
+        """Outer product."""
+        import cupy as cp
+        return cp.outer(a.flatten(), b.flatten())
+    def stack(self, arrays, axis=0):
+        """Stack arrays along a new axis."""
+        import cupy as cp
+        return cp.stack(arrays, axis=axis)
+    def zeros(self, shape, dtype=None):
+        """Create array of zeros."""
+        import cupy as cp
+        return cp.zeros(shape, dtype=dtype)
+    def arange(self, start, stop=None, step=1, dtype=None):
+        """Create range array."""
+        import cupy as cp
+        if stop is None:
+            result = cp.arange(start, step=step)
+        else:
+            result = cp.arange(start, stop, step=step)
+        if dtype is not None:
+            result = result.astype(dtype)
+        return result
+    def array(self, val, dtype=None):
+        """Create a scalar or array from a value."""
+        import cupy as cp
+        return cp.array(val, dtype=dtype)
+    def atleast_1d(self, x):
+        """Ensure array is at least 1D."""
+        import cupy as cp
+        return cp.atleast_1d(x)
+    @property
+    def newaxis(self):
+        """Alias for None, used in indexing."""
+        import cupy as cp
+        return cp.newaxis
+    @property
+    def float64(self):
+        """float64 dtype."""
+        import cupy as cp
+        return cp.float64
+    @property
+    def float32(self):
+        """float32 dtype."""
+        import cupy as cp
+        return cp.float32
+    @property
+    def int64(self):
+        """int64 dtype."""
+        import cupy as cp
+        return cp.int64
+    @property
+    def int32(self):
+        """int32 dtype."""
+        import cupy as cp
+        return cp.int32
+    def clip(self, x, min_val, max_val):
+        """Clip values to [min_val, max_val]."""
+        import cupy as cp
+        return cp.clip(x, min_val, max_val)
+    def minimum(self, x, y):
+        """Element-wise minimum of two arrays."""
+        import cupy as cp
+        return cp.minimum(x, y)
+    def maximum(self, x, y):
+        """Element-wise maximum of two arrays."""
+        import cupy as cp
+        return cp.maximum(x, y)
+    def matmul(self, a, b):
+        """Matrix multiplication."""
+        import cupy as cp
+        return cp.matmul(a, b)
+    def min(self, x, axis=None, keepdims=False):
+        """Minimum value along axis."""
+        import cupy as cp
+        return cp.min(x, axis=axis, keepdims=keepdims)
+    def expand_dims(self, x, axis):
+        """Expand array dimensions."""
+        import cupy as cp
+        return cp.expand_dims(x, axis)
+    def eigh(self, a):
+        """Eigenvalue decomposition for symmetric/Hermitian matrices."""
+        import cupy as cp
+        return cp.linalg.eigh(a)
+    def argmin(self, x, axis=None):
+        """Indices of minimum values along axis."""
+        import cupy as cp
+        return cp.argmin(x, axis=axis)
+    def argmax(self, x, axis=None):
+        """Indices of maximum values along axis."""
+        import cupy as cp
+        return cp.argmax(x, axis=axis)
+    def argsort(self, x, axis=-1):
+        """Indices that would sort the array."""
+        import cupy as cp
+        return cp.argsort(x, axis=axis)
+    def where(self, condition, x, y):
+        """Element-wise conditional selection."""
+        import cupy as cp
+        return cp.where(condition, x, y)
+    def flip(self, x, axis=None):
+        """Reverse array order along axis."""
+        import cupy as cp
+        return cp.flip(x, axis=axis)
+    def exp(self, x):
+        """Element-wise exponential."""
+        import cupy as cp
+        return cp.exp(x)
+    def log(self, x):
+        """Element-wise natural logarithm."""
+        import cupy as cp
+        return cp.log(x)
+    def copy(self, x):
+        """Return a copy of x."""
+        import cupy as cp
+        return x.copy()
+    def ones(self, shape, dtype=None):
+        """Create array of ones."""
+        import cupy as cp
+        return cp.ones(shape, dtype=dtype)
+    def full(self, shape, fill_value, dtype=None):
+        """Create array filled with a constant value."""
+        import cupy as cp
+        return cp.full(shape, fill_value, dtype=dtype)
+    def diag(self, x, k=0):
+        """Extract diagonal or create diagonal matrix."""
+        import cupy as cp
+        return cp.diag(x, k=k)
+    def transpose(self, x, axes=None):
+        """Transpose array."""
+        import cupy as cp
+        return cp.transpose(x, axes)
+    def eye(self, n, m=None, dtype=None):
+        """Create identity matrix."""
+        import cupy as cp
+        if m is None:
+            m = n
+        return cp.eye(n, m, dtype=dtype)
+    def cummin(self, arr, axis=0):
+        """Cumulative minimum along *axis* (GPU-native for small arrays)."""
+        import cupy as cp
+        if arr.size == 0 or arr.shape[axis] == 0:
+            return arr.copy()
+        if str(arr.dtype) not in _CUPY_CUMOP_DTYPES:
+            return cp.minimum.accumulate(arr, axis=axis)
+        if arr.ndim == 1:
+            return self._cumop_1d(arr, cp.minimum)
+        # Multi-dim: transpose target axis to last, scan, transpose back
+        if axis != arr.ndim - 1:
+            axes = list(range(arr.ndim))
+            axes[axis], axes[-1] = axes[-1], axes[axis]
+            arr = cp.transpose(arr, axes)
+            return cp.transpose(self._cumop_last_axis(arr, cp.minimum), axes)
+        return self._cumop_last_axis(arr, cp.minimum)
+    def cummax(self, arr, axis=0):
+        """Cumulative maximum along *axis* (GPU-native for small arrays)."""
+        import cupy as cp
+        if arr.size == 0 or arr.shape[axis] == 0:
+            return arr.copy()
+        if str(arr.dtype) not in _CUPY_CUMOP_DTYPES:
+            return cp.maximum.accumulate(arr, axis=axis)
+        if arr.ndim == 1:
+            return self._cumop_1d(arr, cp.maximum)
+        if axis != arr.ndim - 1:
+            axes = list(range(arr.ndim))
+            axes[axis], axes[-1] = axes[-1], axes[axis]
+            arr = cp.transpose(arr, axes)
+            return cp.transpose(self._cumop_last_axis(arr, cp.maximum), axes)
+        return self._cumop_last_axis(arr, cp.maximum)
+    @staticmethod
+    def _cumop_1d(arr, op):
+        """1D cumulative op using sequential write."""
+        import cupy as cp
+        # Ensure contiguous for CUDA kernel compatibility
+        if not arr.flags.c_contiguous:
+            arr = cp.ascontiguousarray(arr)
+        n = len(arr)
+        if n == 0:
+            return cp.empty_like(arr)
+        result = cp.empty_like(arr)
+        result[0] = arr[0]
+        if n > 1:
+            _launch_cumop_1d(arr, result, n, op is cp.minimum)
+        return result
+    @staticmethod
+    def _cumop_last_axis(arr, op):
+        """Cumulative op along last axis for N-D arrays."""
+        import cupy as cp
+        # Ensure contiguous for CUDA kernel compatibility
+        if not arr.flags.c_contiguous:
+            arr = cp.ascontiguousarray(arr)
+        shape = arr.shape
+        K = shape[-1]
+        if K == 0:
+            return cp.empty_like(arr)
+        flat = arr.reshape(-1, K)
+        N = flat.shape[0]
+        if N == 0:
+            return cp.empty_like(arr)
+        result = cp.empty_like(flat)
+        result[:, 0] = flat[:, 0]
+        if K > 1:
+            _launch_cumop_2d(flat, result, N, K, op is cp.minimum)
+        return result.reshape(shape)
+# ── Raw CUDA kernels for cumulative scan ──
+_cumop_1d_template = r'''
+extern "C" __global__
+void {name}(const {dtype}* __restrict__ x,
+            {dtype}* __restrict__ out, int n) {{
+    {dtype} cur = x[0];
+    out[0] = cur;
+    for (int j = 1; j < n; j++) {{
+        if ({cmp}) cur = x[j];
+        out[j] = cur;
+    }}
+}}
+'''
+_cumop_2d_template = r'''
+extern "C" __global__
+void {name}(const {dtype}* __restrict__ x,
+            {dtype}* __restrict__ out, int N, int K) {{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) return;
+    const {dtype}* row = x + tid * K;
+    {dtype}* orow = out + tid * K;
+    {dtype} cur = row[0];
+    orow[0] = cur;
+    for (int j = 1; j < K; j++) {{
+        if ({cmp}) cur = row[j];
+        orow[j] = cur;
+    }}
+}}
+'''
+_CUPY_CUMOP_DTYPES = {
+    "float64": "double",
+    "float32": "float",
+    "int64": "long long",
+    "int32": "int",
+}
+_cumop_kernels = {}
+def _get_cumop_kernels(dtype):
+    dtype = str(dtype)
+    if dtype not in _CUPY_CUMOP_DTYPES:
+        raise TypeError(f"Unsupported dtype for CuPy cumop kernels: {dtype}")
+    if dtype in _cumop_kernels:
+        return _cumop_kernels[dtype]
+    import cupy as cp
+    ctype = _CUPY_CUMOP_DTYPES[dtype]
+    kmin1_mod = cp.RawModule(code=_cumop_1d_template.format(name="cummin_1d", dtype=ctype, cmp="x[j] < cur"))
+    kmax1_mod = cp.RawModule(code=_cumop_1d_template.format(name="cummax_1d", dtype=ctype, cmp="x[j] > cur"))
+    kmin2_mod = cp.RawModule(code=_cumop_2d_template.format(name="cummin_2d", dtype=ctype, cmp="row[j] < cur"))
+    kmax2_mod = cp.RawModule(code=_cumop_2d_template.format(name="cummax_2d", dtype=ctype, cmp="row[j] > cur"))
+    kernels = (
+        kmin1_mod.get_function('cummin_1d'),
+        kmax1_mod.get_function('cummax_1d'),
+        kmin2_mod.get_function('cummin_2d'),
+        kmax2_mod.get_function('cummax_2d'),
+    )
+    _cumop_kernels[dtype] = kernels
+    return kernels
+def _cumop_kernels_available(dtype=None):
+    """Check if CuPy cumop kernels can be compiled (lazy, caches on first call)."""
+    try:
+        _get_cumop_kernels(dtype or "float64")
+        return True
+    except Exception:
+        return False
+def _launch_cumop_1d(arr, result, n, is_min):
+    if arr is None or result is None:
+        raise RuntimeError(
+            "CuPy cumop kernels failed to compile or unavailable. "
+            "Cannot run cummin/cummax on this device."
+        )
+    kmin1, kmax1, _, _ = _get_cumop_kernels(arr.dtype)
+    kernel = kmin1 if is_min else kmax1
+    kernel((1,), (1,), (arr, result, n))
+def _launch_cumop_2d(arr, result, N, K, is_min):
+    if arr is None or result is None:
+        raise RuntimeError(
+            "CuPy cumop kernels failed to compile or unavailable. "
+            "Cannot run cummin/cummax on this device."
+        )
+    _, _, kmin2, kmax2 = _get_cumop_kernels(arr.dtype)
+    kernel = kmin2 if is_min else kmax2
+    block = min(N, 256)
+    grid = (N + block - 1) // block
+    kernel((grid,), (block,), (arr, result, N, K))

statgpu/backends/_factory.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""
+Backend factory: select the appropriate compute backend automatically or
+explicitly by name.
+"""
+from statgpu.backends._base import BackendBase
+from statgpu.backends._numpy import NumpyBackend
+from statgpu.backends._cupy import CuPyBackend
+from statgpu.backends._torch import TorchBackend
+# Module-level singletons (one instance per library, shared across calls).
+# They are created eagerly when this module is imported; get_backend() only
+# ever hands out these three objects and never constructs new ones.
+_numpy_backend = NumpyBackend()
+_cupy_backend = CuPyBackend()
+_torch_backend = TorchBackend()
+def get_backend(backend: str = "auto", device: str = "auto") -> BackendBase:
+    """
+    Return a compute backend instance.
+    Parameters
+    ----------
+    backend : {'auto', 'numpy', 'cupy', 'torch'}, default='auto'
+        Which array library to use.
+        * ``'numpy'`` – always use NumPy (CPU).
+        * ``'cupy'``  – use CuPy (requires a CUDA GPU and the ``cupy`` package).
+        * ``'torch'`` – use PyTorch (requires the ``torch`` package; defaults
+          to CUDA if available, else CPU).
+        * ``'auto'``  – pick automatically: CuPy if available, else PyTorch
+          CUDA if available, else NumPy.
+    device : {'auto', 'cpu', 'cuda'}, default='auto'
+        Hint about the target device.  Ignored when *backend* is explicitly
+        set to a non-``'auto'`` value.  When ``'cpu'``, always returns the
+        NumPy backend regardless of GPU availability.
+    Returns
+    -------
+    BackendBase
+        A backend instance that can be used to create/convert arrays.
+    Examples
+    --------
+    >>> from statgpu.backends import get_backend
+    >>> xp = get_backend().xp      # numpy, cupy, or torch depending on hw
+    >>> arr = xp.zeros((3, 3))
+    """
+    if backend == "numpy":
+        return _numpy_backend
+    if backend == "cupy":
+        return _cupy_backend
+    if backend == "torch":
+        return _torch_backend
+    # --- auto-selection ---
+    if device == "cpu":
+        return _numpy_backend
+    # Prefer CuPy → PyTorch CUDA → NumPy
+    if _cupy_backend.is_available():
+        return _cupy_backend
+    if _torch_backend.is_available():
+        return _torch_backend
+    return _numpy_backend