PyPI - hapc - Versions diffs - 2.1.0__tar.gz → 2.3.1__tar.gz - Mend

hapc 2.1.0 tar.gz → 2.3.1 tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{hapc-2.1.0 → hapc-2.3.1}/CMakeLists.txt RENAMED Viewed

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.18)
 project(hapc)
 set(CMAKE_CXX_STANDARD 17)
@@ -15,7 +15,13 @@ endif()
 # Python3_EXECUTABLE from setup.py so the build always targets the *same*
 # interpreter that pip is using.  Without this CMake may discover a newer/
 # older system Python and produce a .so tagged for the wrong ABI.
-find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+#
+# Use Development.Module (headers only), NOT the full Development component:
+# the latter also requires Development.Embed -> libpython, which manylinux
+# images deliberately do not ship (extension modules must not link libpython).
+# Requiring full Development makes the manylinux build fail with
+# "Could NOT find Python3 (missing: Python3_LIBRARIES Development.Embed)".
+find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
 message(STATUS "Python3_EXECUTABLE: ${Python3_EXECUTABLE}")
 message(STATUS "Python3_VERSION: ${Python3_VERSION}")

{hapc-2.1.0/python/hapc.egg-info → hapc-2.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hapc
-Version: 2.1.0
+Version: 2.3.1
 Summary: Highly Adaptive Principal Components
 Home-page: https://github.com/meixide/hapc
 Author: Carlos García Meixide
@@ -51,16 +51,41 @@ A fast and flexible machine learning library for nonparametric high-dimensional
 pip install hapc
 ```
+Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
+(Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
+CMake or Eigen is needed when a wheel is available.
+### Linux / HPC clusters
+The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
+`pip install hapc` works out of the box on HPC login/compute nodes —
+no `conda` toolchain, `devtoolset`, or sysroot setup required:
+```bash
+pip install hapc
+```
+If you must build from the source distribution (niche architecture, a
+Python version with no prebuilt wheel, or an air-gapped node), provide a
+C++17 compiler and either let CMake fetch Eigen automatically (needs
+network access) or install Eigen so that `find_package(Eigen3)` finds it:
+```bash
+# with conda compilers (recommended on HPC)
+conda install -c conda-forge cxx-compiler cmake eigen
+pip install hapc --no-binary hapc
+```
 ### Install from GitHub (latest development version)
 ```bash
-pip install git+https://github.com/yourusername/hapc.git
+pip install git+https://github.com/meixide/hapc.git
 ```
 Or with editable install for development:
 ```bash
-git clone https://github.com/yourusername/hapc.git
+git clone https://github.com/meixide/hapc.git
 cd hapc
 pip install -e .
 ```
@@ -201,7 +226,7 @@ Cross-validation to select lambda.
 Contributions welcome! The C++ core is shared between R and Python packages.
 ```bash
-git clone https://github.com/yourusername/hapc.git
+git clone https://github.com/meixide/hapc.git
 cd hapc
 pip install -e .
 pytest

{hapc-2.1.0 → hapc-2.3.1}/README.md RENAMED Viewed

@@ -17,16 +17,41 @@ A fast and flexible machine learning library for nonparametric high-dimensional
 pip install hapc
 ```
+Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
+(Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
+CMake or Eigen is needed when a wheel is available.
+### Linux / HPC clusters
+The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
+`pip install hapc` works out of the box on HPC login/compute nodes —
+no `conda` toolchain, `devtoolset`, or sysroot setup required:
+```bash
+pip install hapc
+```
+If you must build from the source distribution (niche architecture, a
+Python version with no prebuilt wheel, or an air-gapped node), provide a
+C++17 compiler and either let CMake fetch Eigen automatically (needs
+network access) or install Eigen so that `find_package(Eigen3)` finds it:
+```bash
+# with conda compilers (recommended on HPC)
+conda install -c conda-forge cxx-compiler cmake eigen
+pip install hapc --no-binary hapc
+```
 ### Install from GitHub (latest development version)
 ```bash
-pip install git+https://github.com/yourusername/hapc.git
+pip install git+https://github.com/meixide/hapc.git
 ```
 Or with editable install for development:
 ```bash
-git clone https://github.com/yourusername/hapc.git
+git clone https://github.com/meixide/hapc.git
 cd hapc
 pip install -e .
 ```
@@ -167,7 +192,7 @@ Cross-validation to select lambda.
 Contributions welcome! The C++ core is shared between R and Python packages.
 ```bash
-git clone https://github.com/yourusername/hapc.git
+git clone https://github.com/meixide/hapc.git
 cd hapc
 pip install -e .
 pytest

hapc-2.3.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,70 @@
+[build-system]
+requires = ["setuptools>=65", "wheel", "cmake>=3.18", "pybind11>=2.6"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "hapc"
+version = "2.3.1"
+description = "Highly Adaptive Principal Components"
+readme = "README.md"
+requires-python = ">=3.8"
+authors = [
+    {name = "Carlos García Meixide", email = "cgmeixide@gmail.com"}
+]
+license = {text = "MIT"}
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "numpy>=1.24,<2.3",
+    "scikit-learn>=1.0",
+]
+[project.optional-dependencies]
+dev = ["pytest", "pytest-cov", "black", "flake8"]
+[project.urls]
+Homepage = "https://github.com/meixide/hapc"
+Documentation = "https://github.com/meixide/hapc#readme"
+Repository = "https://github.com/meixide/hapc.git"
+Issues = "https://github.com/meixide/hapc/issues"
+[tool.cibuildwheel]
+# Build CPython 3.8–3.12 only; skip PyPy and musl (HPC/desktop targets are glibc).
+build = "cp38-* cp39-* cp310-* cp311-* cp312-*"
+skip = ["pp*", "*-musllinux*"]
+build-verbosity = 1
+# Smoke-test every wheel: install it (pulling numpy/scikit-learn) and import the
+# compiled extension. Catches wrong-ABI / unresolved-symbol wheels before publish.
+test-command = "python -c \"import hapc; print(hapc.__version__)\""
+# NOTE: the latest scipy (transitive via scikit-learn) no longer ships a
+# manylinux2014 wheel, so the in-container smoke test would try to compile it
+# from source and fail. We force scipy to resolve from a binary wheel via
+# PIP_ONLY_BINARY=scipy, set in the workflow and passed into the Linux container
+# (see .github/workflows/build-and-publish.yml). It must apply to the wheel
+# install itself, which is why it lives in the env rather than test-requires.
+[tool.cibuildwheel.linux]
+archs = ["x86_64"]
+# manylinux2014 -> glibc 2.17 baseline + bundled libstdc++ via auditwheel, so the
+# wheel installs and runs unmodified on any HPC cluster (glibc >= 2.17), no
+# compiler / conda toolchain / sysroot required.
+manylinux-x86_64-image = "manylinux2014"
+[tool.cibuildwheel.macos]
+# Build fat universal2 wheels (x86_64 + arm64) from a single runner. CMake does
+# not honour the interpreter's arch flags, so the arch is forced explicitly via
+# CMAKE_ARGS below (setup.py appends $CMAKE_ARGS to the cmake invocation).
+# delocate then verifies both slices are present, which is what caught the old
+# single-arch-but-universal2-tagged wheels.
+archs = ["universal2"]
+environment = { CMAKE_ARGS = "-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64" }
+[tool.cibuildwheel.windows]
+archs = ["AMD64"]

{hapc-2.1.0 → hapc-2.3.1}/python/hapc/__init__.py RENAMED Viewed

@@ -19,7 +19,7 @@ Lower-level building blocks:
 - :func:`ate_hapc` — ATE estimate + Wald CI via HAPC + outcome undersmoothing.
 """
-__version__ = "2.1.0"
+__version__ = "2.3.1"
 from .core import (
     DesignOutput,

{hapc-2.1.0 → hapc-2.3.1}/python/hapc/cv.py RENAMED Viewed

@@ -18,7 +18,11 @@ import numpy as np
 from . import hapc_core
 from .core import _C, cross_kernel_hapc, design_hapc
-from .single import single_pcghal_classification_lasso
+from .single import (
+    _check_binomial_labels,
+    _to_soft01,
+    single_pcghal_classification_lasso,
+)
 class CVResult(NamedTuple):
@@ -376,6 +380,9 @@ def pcghal_cv_classi_lasso(X: np.ndarray, Y: np.ndarray,
     if not np.all(lams > 0):
         raise ValueError("All lambdas must be > 0 for logistic LASSO.")
+    # Soft target in [0,1] used for the held-out cross-entropy deviance
+    # (accepts hard {0,1}/{-1,+1} or fractional EM-HAL posteriors).
+    q = _to_soft01(Y)
     folds = _native_folds(n, int(nfolds))
     L = lams.size
     fold_dev = np.full((int(nfolds), L), np.nan)
@@ -386,7 +393,7 @@ def pcghal_cv_classi_lasso(X: np.ndarray, Y: np.ndarray,
         if te.size == 0 or tr.size == 0:
             continue
         Xtr, Ytr = X[tr], Y[tr]
-        Xte, Yte = X[te], Y[te]
+        Xte, Yte = X[te], q[te]
         for j, lam in enumerate(lams):
             res = single_pcghal_classification_lasso(
@@ -395,9 +402,7 @@ def pcghal_cv_classi_lasso(X: np.ndarray, Y: np.ndarray,
                 verbose=bool(verbose), max_iter=int(max_iter),
             )
             probs = np.clip(res.probabilities, 1e-15, 1 - 1e-15)
-            yte01 = (Yte == 1).astype(np.float64) if set(np.unique(Yte).tolist()).issubset({0.0, 1.0}) \
-                else (Yte > 0).astype(np.float64)
-            dev = -(yte01 * np.log(probs) + (1 - yte01) * np.log(1 - probs))
+            dev = -(Yte * np.log(probs) + (1 - Yte) * np.log(1 - probs))
             fold_dev[k - 1, j] = float(dev.mean())
     deviances = np.nanmean(fold_dev, axis=0)
@@ -500,6 +505,8 @@ def cv_hapc(X: np.ndarray, Y: np.ndarray,
     lams = _grid(None, log_lambda_min, log_lambda_max, grid_length)
     if family == "binomial":
+        # Validate labels; allow soft labels in [0,1] only for norm in {"1","2"}.
+        _check_binomial_labels(Y, norm)
         if norm in {"sv", "2"}:
             return pcghal_cv_classi(
                 X, Y, max_degree=max_degree, npcs=npcs,

{hapc-2.1.0 → hapc-2.3.1}/python/hapc/single.py RENAMED Viewed

@@ -95,6 +95,61 @@ def _to_pm1(Y: np.ndarray, *, verbose: bool = False) -> np.ndarray:
     )
+def _label_kind(Y: np.ndarray) -> str:
+    """Classify a binomial response vector.
+    Returns ``"01"`` (hard labels in ``{0,1}``), ``"pm1"`` (hard labels in
+    ``{-1,+1}``), or ``"soft"`` (fractional labels in ``[0,1]``, e.g. EM-HAL
+    E-step posteriors). Raises ``ValueError`` if any value falls outside
+    ``[0,1]`` and the set is not exactly ``{-1,+1}``.
+    """
+    Y = np.asarray(Y, dtype=np.float64).ravel()
+    u = np.unique(Y[~np.isnan(Y)])
+    s = set(u.tolist())
+    if s.issubset({0.0, 1.0}):
+        return "01"
+    if s == {-1.0, 1.0}:
+        return "pm1"
+    if u.size and u.min() >= 0.0 and u.max() <= 1.0:
+        return "soft"
+    raise ValueError(
+        "family='binomial' requires Y in {0,1}, {-1,+1}, or soft labels in "
+        "[0,1]; found values outside [0,1]."
+    )
+def _to_soft01(Y: np.ndarray) -> np.ndarray:
+    """Map a binomial response to a soft cross-entropy target in ``[0,1]``."""
+    Y = np.asarray(Y, dtype=np.float64).ravel()
+    return (Y + 1.0) / 2.0 if _label_kind(Y) == "pm1" else Y
+def _check_binomial_labels(Y: np.ndarray, norm: str) -> str:
+    """Validate labels and enforce the soft-label norm restriction.
+    Soft labels (any value strictly inside ``(0,1)``) are supported only for
+    ``norm`` in ``{"1","2"}``; ``norm="sv"`` raises ``NotImplementedError``.
+    A warning is emitted whenever soft labels are detected. Returns the label
+    kind from :func:`_label_kind`.
+    """
+    import warnings
+    kind = _label_kind(Y)
+    if kind == "soft":
+        if norm == "sv":
+            raise NotImplementedError(
+                "Soft labels (Y in (0,1)) are not implemented for norm='sv'; "
+                "use norm='1' or norm='2'."
+            )
+        warnings.warn(
+            "Non-binary labels detected in Y: treating them as soft labels in "
+            "[0,1] (cross-entropy target). Supported only for norm='1' and "
+            "norm='2'.",
+            stacklevel=2,
+        )
+    return kind
 def _calibrate_logistic_intercept(y01: np.ndarray, eta: np.ndarray) -> float:
     """Newton calibration for intercept with fixed linear predictor ``eta``."""
     y01 = np.asarray(y01, dtype=np.float64).ravel()
@@ -367,24 +422,21 @@ def single_pcghal_classification_ridge_only(
     SinglePcghalClassificationResult
     """
     X, Y, n, p = _check_xy(X, Y)
-    Y_pm1 = _to_pm1(Y, verbose=verbose)
+    # Accept hard {0,1}/{-1,+1} or soft [0,1] labels (cross-entropy target).
+    y01 = _to_soft01(Y)
     des = design_hapc(X, max_degree, npcs, center=center)
     final_npc = des.d.shape[0]
     Xtilde = des.U[:, :final_npc] * des.d[:final_npc]
     alpha = np.asarray(
-        hapc_core.logistic_ridge_init(_C(Y_pm1), _C(Xtilde), float(lambda_))
+        hapc_core.logistic_ridge_init_y01(_C(y01), _C(Xtilde), float(lambda_))
     ).ravel()
     eta = Xtilde @ alpha
-    y01 = (Y_pm1 > 0).astype(np.float64)
     b0 = _calibrate_logistic_intercept(y01, eta)
-    ymu = Y_pm1 * (eta + b0)
-    risk = float(
-        np.where(ymu > 0, np.log1p(np.exp(-ymu)), -ymu + np.log1p(np.exp(ymu)))
-        .mean()
-    )
+    phat = np.clip(1.0 / (1.0 + np.exp(-(eta + b0))), 1e-15, 1 - 1e-15)
+    risk = float((-(y01 * np.log(phat) + (1 - y01) * np.log(1 - phat))).mean())
     predictions = probabilities = predicted_classes = None
     if predict is not None:
@@ -480,13 +532,26 @@ def single_pcghal_classification_lasso(
         raise ValueError(f"lambda_ must be > 0 for LASSO; got {lambda_}")
     X, Y, n, p = _check_xy(X, Y)
-    Y_pm1 = _to_pm1(Y, verbose=verbose)
-    Y_01 = (Y_pm1 > 0).astype(np.int64)
+    # Accept hard {0,1}/{-1,+1} or soft [0,1] labels (cross-entropy target).
+    q = _to_soft01(Y)
     des = design_hapc(X, max_degree, npcs, center=center)
     final_npc = des.d.shape[0]
     Xtilde = des.U[:, :final_npc] * des.d[:final_npc]
+    # For soft labels, replicate each row as a (label=1, weight=q) and
+    # (label=0, weight=1-q) pair so the sample-weighted logistic loss equals
+    # the soft cross-entropy. On hard labels this reduces to the plain fit.
+    is_soft = bool(np.any((q > 1e-12) & (q < 1.0 - 1e-12)))
+    if is_soft:
+        Xfit = _C(np.vstack([Xtilde, Xtilde]))
+        yfit = np.concatenate([np.ones(n), np.zeros(n)]).astype(np.int64)
+        wfit = np.concatenate([q, 1.0 - q]).astype(np.float64)
+    else:
+        Xfit = _C(Xtilde)
+        yfit = (q > 0.5).astype(np.int64)
+        wfit = None
     C = 1.0 / (n * float(lambda_))
     # sklearn>=1.8 deprecated penalty="l1" in favour of l1_ratio=1 with the
     # liblinear solver; older versions still need penalty="l1". Try the new
@@ -495,24 +560,28 @@ def single_pcghal_classification_lasso(
     sig_params = inspect.signature(LogisticRegression).parameters
     common_kw = dict(solver="liblinear", C=C, fit_intercept=False,
                      max_iter=int(max_iter))
+    def _fit(**ctor):
+        m = LogisticRegression(**ctor, **common_kw)
+        if wfit is None:
+            m.fit(Xfit, yfit)
+        else:
+            m.fit(Xfit, yfit, sample_weight=wfit)
+        return m
     if "l1_ratio" in sig_params and "penalty" in sig_params:
         try:
-            model = LogisticRegression(l1_ratio=1.0, **common_kw)
-            model.fit(_C(Xtilde), Y_01)
+            model = _fit(l1_ratio=1.0)
         except (TypeError, ValueError):
-            model = LogisticRegression(penalty="l1", **common_kw)
-            model.fit(_C(Xtilde), Y_01)
+            model = _fit(penalty="l1")
     else:  # pragma: no cover  (very old sklearn)
-        model = LogisticRegression(penalty="l1", **common_kw)
-        model.fit(_C(Xtilde), Y_01)
+        model = _fit(penalty="l1")
     alpha = np.asarray(model.coef_, dtype=np.float64).ravel()
-    b0 = _calibrate_logistic_intercept(Y_01.astype(np.float64), Xtilde @ alpha)
+    b0 = _calibrate_logistic_intercept(q, Xtilde @ alpha)
     eta = Xtilde @ alpha + b0
-    ymu = Y_pm1 * eta
-    risk = float(
-        np.where(ymu > 0, np.log1p(np.exp(-ymu)), -ymu + np.log1p(np.exp(ymu))).mean()
-    )
+    phat = np.clip(1.0 / (1.0 + np.exp(-eta)), 1e-15, 1 - 1e-15)
+    risk = float((-(q * np.log(phat) + (1 - q) * np.log(1 - phat))).mean())
     predictions = probabilities = predicted_classes = None
     if predict is not None:
@@ -560,8 +629,10 @@ def hapc(X: np.ndarray, Y: np.ndarray,
     X : np.ndarray, shape (n, p)
         Features.
     Y : np.ndarray, shape (n,)
-        Response. For ``family="binomial"`` must contain only ``{0,1}`` or
-        ``{-1,+1}``.
+        Response. For ``family="binomial"``: hard labels in ``{0,1}`` or
+        ``{-1,+1}``, or soft labels in ``[0,1]`` (e.g. EM-HAL E-step
+        posteriors). Soft labels are supported only for ``norm`` in
+        ``{"1","2"}``; ``norm="sv"`` requires hard labels.
     family : {"gaussian", "binomial"}, default "gaussian"
         Loss family.
     max_degree : int, default 1
@@ -617,6 +688,8 @@ def hapc(X: np.ndarray, Y: np.ndarray,
         npcs = int(X.shape[0])
     if family == "binomial":
+        # Validate labels; allow soft labels in [0,1] only for norm in {"1","2"}.
+        _check_binomial_labels(Y, norm)
         if norm == "sv":
             return single_pcghal_classification(
                 X, Y, max_degree, npcs, lambda_,

{hapc-2.1.0 → hapc-2.3.1/python/hapc.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hapc
-Version: 2.1.0
+Version: 2.3.1
 Summary: Highly Adaptive Principal Components
 Home-page: https://github.com/meixide/hapc
 Author: Carlos García Meixide
@@ -51,16 +51,41 @@ A fast and flexible machine learning library for nonparametric high-dimensional
 pip install hapc
 ```
+Prebuilt wheels are published for Linux (manylinux2014, x86_64), macOS
+(Intel + Apple Silicon) and Windows, for CPython 3.8–3.12. No compiler,
+CMake or Eigen is needed when a wheel is available.
+### Linux / HPC clusters
+The Linux wheels use the **manylinux2014** baseline (glibc 2.17), so
+`pip install hapc` works out of the box on HPC login/compute nodes —
+no `conda` toolchain, `devtoolset`, or sysroot setup required:
+```bash
+pip install hapc
+```
+If you must build from the source distribution (niche architecture, a
+Python version with no prebuilt wheel, or an air-gapped node), provide a
+C++17 compiler and either let CMake fetch Eigen automatically (needs
+network access) or install Eigen so that `find_package(Eigen3)` finds it:
+```bash
+# with conda compilers (recommended on HPC)
+conda install -c conda-forge cxx-compiler cmake eigen
+pip install hapc --no-binary hapc
+```
 ### Install from GitHub (latest development version)
 ```bash
-pip install git+https://github.com/yourusername/hapc.git
+pip install git+https://github.com/meixide/hapc.git
 ```
 Or with editable install for development:
 ```bash
-git clone https://github.com/yourusername/hapc.git
+git clone https://github.com/meixide/hapc.git
 cd hapc
 pip install -e .
 ```
@@ -201,7 +226,7 @@ Cross-validation to select lambda.
 Contributions welcome! The C++ core is shared between R and Python packages.
 ```bash
-git clone https://github.com/yourusername/hapc.git
+git clone https://github.com/meixide/hapc.git
 cd hapc
 pip install -e .
 pytest

{hapc-2.1.0 → hapc-2.3.1}/setup.py RENAMED Viewed

@@ -3,6 +3,7 @@
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_ext import build_ext
 import os
+import shlex
 import subprocess
 import sys
 from pathlib import Path
@@ -44,6 +45,15 @@ class CMakeBuild(build_ext):
         build_args = ['--config', cfg]
         cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+        # Honour the conventional CMAKE_ARGS env var (set by cibuildwheel/conda).
+        # Used to force universal2 macOS builds via
+        # CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64", which CMake cannot
+        # infer from the (single-arch) build interpreter on its own.
+        extra_cmake_args = os.environ.get('CMAKE_ARGS')
+        if extra_cmake_args:
+            cmake_args += shlex.split(extra_cmake_args)
         # Add parallel build flag only on non-Windows platforms
         # On Windows, MSBuild doesn't support -j flag and handles parallelization automatically
         if sys.platform != 'win32':

{hapc-2.1.0 → hapc-2.3.1}/src/bindings.cpp RENAMED Viewed

@@ -117,4 +117,9 @@ PYBIND11_MODULE(hapc_core, m) {
     m.def("logistic_ridge_init", &logistic_ridge_init,
           py::arg("Y"), py::arg("X"), py::arg("lambda"));
+    // Soft-label logistic ridge initialiser: target Y may be any value in
+    // [0,1] (hard {0,1} labels or fractional EM-HAL E-step posteriors).
+    m.def("logistic_ridge_init_y01", &logistic_ridge_init_y01,
+          py::arg("Y"), py::arg("X"), py::arg("lambda"));
 }

{hapc-2.1.0 → hapc-2.3.1}/src/hapc_core.hpp RENAMED Viewed

@@ -91,6 +91,11 @@ FastCVOutput fasthal_cv_python(const MatrixXd& X, const VectorXd& Y, int npc,
 // (internally multiplied by n, matching logistic_call).
 VectorXd logistic_ridge_init(const VectorXd& Y_pm1, const MatrixXd& X, double lambda);
+// Soft-label variant: target `y01` may take any value in [0, 1] (hard {0,1}
+// labels or fractional EM-HAL E-step posteriors). On hard {0,1} inputs the
+// result is identical to logistic_ridge_init. lambda has the same scaling.
+VectorXd logistic_ridge_init_y01(const VectorXd& y01, const MatrixXd& X, double lambda);
 // Cross-validation output for binomial (logistic) HAPC.
 struct CVClassiOutput {
     std::vector<double> deviances;
@@ -101,7 +106,9 @@ struct CVClassiOutput {
 };
 // Python-friendly binomial CV (mirrors R `pchal_cv_classi_call`).
-// Y must contain only 0 or 1 values.
+// Y must lie in [0,1]: hard {0,1} labels or soft EM-HAL posteriors. Soft
+// labels are supported only when with_pgd == false (norm="2"); with_pgd ==
+// true (norm="sv") rejects soft labels.
 //
 // When `with_pgd == true` (default): per fold runs logistic-ridge initialiser
 // followed by projected gradient descent on logistic loss (norm="sv").

{hapc-2.1.0 → hapc-2.3.1}/src/pcghal_cv_classi_cpp.cpp RENAMED Viewed

@@ -28,10 +28,15 @@
 // rule `beta := delta_beta` (i.e. solving the full normal equation each
 // iteration, treating the IRLS working response as the regression target).
 // ---------------------------------------------------------------------------
-VectorXd logistic_ridge_init(const VectorXd& Y_pm1, const MatrixXd& X, double lambda) {
+// Soft-label logistic ridge.  The target `y01` may take any value in [0, 1]:
+// hard {0,1} labels or fractional EM-HAL E-step posteriors.  The IRLS update
+// is unchanged (fractional targets are standard for cross-entropy
+// minimisation).  The former {-1,+1} implementation thresholded Y to {0,1}
+// before running this same update, so on hard {0,1} inputs the result is
+// bit-identical to it.
+VectorXd logistic_ridge_init_y01(const VectorXd& y01, const MatrixXd& X, double lambda) {
     const int n = X.rows();
     const int p = X.cols();
-    if (Y_pm1.size() != n) {
+    if (y01.size() != n) {
         throw std::runtime_error("logistic_ridge_init: Y length must match nrow(X).");
     }
     // Match logistic_call: lambda is multiplied by n internally.
@@ -39,12 +44,6 @@ VectorXd logistic_ridge_init(const VectorXd& Y_pm1, const MatrixXd& X, double la
     const int max_iter = 100;
     const double tol = 1e-8;
-    // logistic_call expects Y in {-1,+1} but treats it via the GLM update with
-    // the {0,1} working response.  We replicate that behaviour exactly: convert
-    // back to a {0,1} response y01 = (Y_pm1 + 1) / 2 to compute mu/working z.
-    VectorXd y01(n);
-    for (int i = 0; i < n; ++i) y01[i] = (Y_pm1[i] > 0) ? 1.0 : 0.0;
     VectorXd beta = VectorXd::Zero(p);
     for (int iter = 0; iter < max_iter; ++iter) {
         VectorXd eta = X * beta;
@@ -66,6 +65,15 @@ VectorXd logistic_ridge_init(const VectorXd& Y_pm1, const MatrixXd& X, double la
     return beta;
 }
+// Backward-compatible wrapper: accepts Y in {-1,+1} and converts to {0,1}.
+// Used by the PGD (norm="sv") single-fit path, which is hard-label only.
+VectorXd logistic_ridge_init(const VectorXd& Y_pm1, const MatrixXd& X, double lambda) {
+    const int n = X.rows();
+    VectorXd y01(n);
+    for (int i = 0; i < n; ++i) y01[i] = (Y_pm1[i] > 0) ? 1.0 : 0.0;
+    return logistic_ridge_init_y01(y01, X, lambda);
+}
 static double calibrate_logistic_intercept(const VectorXd& Y01,
                                            const VectorXd& eta) {
     const int n = (int)Y01.size();
@@ -84,16 +92,20 @@ static double calibrate_logistic_intercept(const VectorXd& Y01,
     return b0;
 }
-static double logistic_risk_pm1(const VectorXd& Y_pm1, const VectorXd& eta) {
-    const int n = (int)Y_pm1.size();
+// Soft cross-entropy risk for fractional targets y01 in [0,1], given a linear
+// predictor `eta` (intercept already folded in).  On hard {0,1} labels this
+// equals the former {-1,+1} logistic risk, so behaviour is unchanged on
+// binary inputs.
+static double logistic_risk_y01(const VectorXd& y01, const VectorXd& eta) {
+    const int n = (int)y01.size();
     if (eta.size() != n) {
-        throw std::runtime_error("logistic_risk_pm1: length mismatch");
+        throw std::runtime_error("logistic_risk_y01: length mismatch");
     }
     double risk = 0.0;
     for (int i = 0; i < n; ++i) {
-        const double ymu = Y_pm1[i] * eta[i];
-        risk += (ymu > 0) ? std::log1p(std::exp(-ymu))
-                          : -ymu + std::log1p(std::exp(ymu));
+        const double pi = 1.0 / (1.0 + std::exp(-eta[i]));
+        const double p = std::min(1.0 - 1e-15, std::max(1e-15, pi));
+        risk += -(y01[i] * std::log(p) + (1.0 - y01[i]) * std::log(1.0 - p));
     }
     return risk / n;
 }
@@ -136,28 +148,31 @@ static std::vector<int> make_folds(int n, int K) {
 // for the post-CV refit). When `with_pgd == false`, returns the logistic-ridge
 // initialiser α directly with its training logistic risk; otherwise runs the
 // PGD step on top of it (norm="sv").
-static OptimizerOutput logistic_full_fit(const VectorXd& Y_pm1,
+static OptimizerOutput logistic_full_fit(const VectorXd& Y01,
                                           const MatrixXd& Xtilde,
                                           const MatrixXd& E_Nn,
                                           double lambda,
                                           int max_iter, double tol,
                                           double step_factor, bool verbose,
                                           bool with_pgd) {
-    VectorXd alpha0 = logistic_ridge_init(Y_pm1, Xtilde, lambda);
+    VectorXd alpha0 = logistic_ridge_init_y01(Y01, Xtilde, lambda);
     const int n = Xtilde.rows();
     VectorXd alpha_fit;
     if (with_pgd) {
+        // PGD (norm="sv") uses the {-1,+1} logistic loss and is reached only
+        // for hard labels (soft labels are rejected upstream), so thresholding
+        // at 0.5 recovers the exact {-1,+1} encoding.
+        VectorXd Y_pm1(n);
+        for (int i = 0; i < n; ++i) Y_pm1[i] = (Y01[i] > 0.5) ? 1.0 : -1.0;
         OptimizerOutput out = pcghal_classi_call(Y_pm1, Xtilde, E_Nn, alpha0,
                                                  max_iter, tol, step_factor, verbose);
         alpha_fit = out.alpha;
     } else {
         alpha_fit = alpha0;  // logistic ridge only (norm="2")
     }
-    VectorXd Y01(n);
-    for (int i = 0; i < n; ++i) Y01[i] = (Y_pm1[i] > 0.0) ? 1.0 : 0.0;
     VectorXd eta = Xtilde * alpha_fit;
     const double b0 = calibrate_logistic_intercept(Y01, eta);
-    const double risk = logistic_risk_pm1(Y_pm1, eta.array() + b0);
+    const double risk = logistic_risk_y01(Y01, eta.array() + b0);
     OptimizerOutput out;
     out.alpha = alpha_fit;
     out.alphaiters = MatrixXd::Zero(0, alpha_fit.size());
@@ -177,10 +192,21 @@ CVClassiOutput pcghal_cv_classi_python(const MatrixXd& X, const VectorXd& Y,
     const int n = X.rows();
     const int p = X.cols();
     if (Y.size() != n) throw std::runtime_error("pcghal_cv_classi: length(Y) != nrow(X)");
+    // Y must lie in [0,1]: hard {0,1} labels or soft EM-HAL posteriors. Soft
+    // labels (any value strictly inside (0,1)) are supported only for the
+    // logistic-ridge path (norm="2"); the PGD path (norm="sv", with_pgd=true)
+    // is not implemented for soft labels.
+    bool soft = false;
     for (int i = 0; i < n; ++i) {
-        if (Y[i] != 0.0 && Y[i] != 1.0) {
-            throw std::runtime_error("pcghal_cv_classi: Y must be 0/1");
+        if (Y[i] < -1e-12 || Y[i] > 1.0 + 1e-12) {
+            throw std::runtime_error("pcghal_cv_classi: Y must be in [0,1]");
         }
+        if (Y[i] > 1e-12 && Y[i] < 1.0 - 1e-12) soft = true;
+    }
+    if (soft && with_pgd) {
+        throw std::runtime_error(
+            "pcghal_cv_classi: soft labels (Y in (0,1)) are not implemented for "
+            "norm='sv'; use norm='1' or norm='2'.");
     }
     const int L = (int)lambdas.size();
     if (L <= 0) throw std::runtime_error("pcghal_cv_classi: lambdas must be non-empty");
@@ -198,9 +224,9 @@ CVClassiOutput pcghal_cv_classi_python(const MatrixXd& X, const VectorXd& Y,
     const int final_npc = compute_classi_design(X, maxdeg, npc_eff, center,
                                                  Xtilde, E_Nn, U_top, d_top);
-    // Y in {-1,+1} for the optimiser
-    VectorXd Y_pm1(n);
-    for (int i = 0; i < n; ++i) Y_pm1[i] = (Y[i] == 1.0) ? 1.0 : -1.0;
+    // Soft target in [0,1] used throughout (the ridge/CE machinery works
+    // directly in this space; the PGD branch builds {-1,+1} locally).
+    const VectorXd& Y01 = Y;
     // Degenerate case: R `hapc(family="binomial", …)` passes nfolds=1 with a
     // single λ — there is no proper train/test split.  Fit on full data and
@@ -213,7 +239,7 @@ CVClassiOutput pcghal_cv_classi_python(const MatrixXd& X, const VectorXd& Y,
         for (int j = 0; j < L; ++j) {
             const double lam = lambdas[j];
             OptimizerOutput full_out = logistic_full_fit(
-                Y_pm1, Xtilde, E_Nn, lam, max_iter, tol, step_factor,
+                Y01, Xtilde, E_Nn, lam, max_iter, tol, step_factor,
                 verbose, with_pgd);
             deviances[j] = full_out.risk;
             if (full_out.risk < best_val) {
@@ -265,19 +291,22 @@ CVClassiOutput pcghal_cv_classi_python(const MatrixXd& X, const VectorXd& Y,
             if (ntr == 0 || nte == 0) continue;
             MatrixXd Xtr(ntr, final_npc), Xte(nte, final_npc);
-            VectorXd Ytr_pm1(ntr), Yte01(nte);
+            VectorXd Ytr01(ntr), Yte01(nte);
             for (int i = 0; i < ntr; ++i) {
                 Xtr.row(i) = Xtilde.row(tr_idx[i]);
-                Ytr_pm1[i] = Y_pm1[tr_idx[i]];
+                Ytr01[i] = Y01[tr_idx[i]];
             }
             for (int i = 0; i < nte; ++i) {
                 Xte.row(i) = Xtilde.row(te_idx[i]);
-                Yte01[i] = Y[te_idx[i]];
+                Yte01[i] = Y01[te_idx[i]];
             }
-            VectorXd alpha0 = logistic_ridge_init(Ytr_pm1, Xtr, lambda);
+            VectorXd alpha0 = logistic_ridge_init_y01(Ytr01, Xtr, lambda);
             VectorXd alpha_fold;
             if (with_pgd) {
+                // Hard-label only path (soft labels rejected upstream).
+                VectorXd Ytr_pm1(ntr);
+                for (int i = 0; i < ntr; ++i) Ytr_pm1[i] = (Ytr01[i] > 0.5) ? 1.0 : -1.0;
                 OptimizerOutput out = pcghal_classi_call(Ytr_pm1, Xtr, E_Nn, alpha0,
                                                           max_iter, tol, step_factor,
                                                           verbose);
@@ -287,15 +316,13 @@ CVClassiOutput pcghal_cv_classi_python(const MatrixXd& X, const VectorXd& Y,
             }
             VectorXd eta_tr = Xtr * alpha_fold;
-            VectorXd Ytr01(ntr);
-            for (int i = 0; i < ntr; ++i) Ytr01[i] = (Ytr_pm1[i] > 0.0) ? 1.0 : 0.0;
             const double b0_fold = calibrate_logistic_intercept(Ytr01, eta_tr);
             VectorXd eta = (Xte * alpha_fold).array() + b0_fold;
             VectorXd probs = (1.0 + (-eta.array()).exp()).inverse();
             double dev = 0.0;
             for (int i = 0; i < nte; ++i) {
                 double pi = std::max(1e-15, std::min(1.0 - 1e-15, probs[i]));
-                dev += (Yte01[i] == 1.0) ? -std::log(pi) : -std::log(1.0 - pi);
+                dev += -(Yte01[i] * std::log(pi) + (1.0 - Yte01[i]) * std::log(1.0 - pi));
             }
             fold_error(k - 1, j) = dev / nte;
         }
@@ -325,7 +352,7 @@ CVClassiOutput pcghal_cv_classi_python(const MatrixXd& X, const VectorXd& Y,
     // Refit on full data at best_lambda (logistic ridge ± PGD).
     OptimizerOutput full_out = logistic_full_fit(
-        Y_pm1, Xtilde, E_Nn, best_lambda,
+        Y01, Xtilde, E_Nn, best_lambda,
         max_iter, tol, step_factor, verbose, with_pgd);
     // Predict on `predict_data` if supplied (else empty vector).

{hapc-2.1.0 → hapc-2.3.1}/src/r_bindings.cpp RENAMED Viewed

@@ -347,8 +347,11 @@ extern "C" SEXP single_pcghal_classi_ridge_call(SEXP X_, SEXP Y_, SEXP maxdeg_,
     if (Rf_length(Y_) != n) Rf_error("length(Y) must equal nrow(X).");
     Map<const MatrixXd> X(REAL(X_), n, p);
     Map<const VectorXd> Y01(REAL(Y_), n);
+    // Y must lie in [0,1]: hard {0,1} labels or soft EM-HAL posteriors. The
+    // logistic-ridge fit (norm="2") supports both.
     for (int i = 0; i < n; ++i) {
-        if (Y01[i] != 0.0 && Y01[i] != 1.0) Rf_error("Y must contain only 0 and 1");
+        if (Y01[i] < -1e-12 || Y01[i] > 1.0 + 1e-12)
+            Rf_error("Y must be in [0,1]");
     }
     int maxdeg = Rf_isInteger(maxdeg_) ? INTEGER(maxdeg_)[0] : (int)REAL(maxdeg_)[0];
     int npc = Rf_isInteger(npc_) ? INTEGER(npc_)[0] : (int)REAL(npc_)[0];
@@ -365,9 +368,6 @@ extern "C" SEXP single_pcghal_classi_ridge_call(SEXP X_, SEXP Y_, SEXP maxdeg_,
     const int final_npc = (int)des.d.size();
     MatrixXd Xtilde = des.U * des.d.asDiagonal();
-    VectorXd Y_pm1(n);
-    for (int i = 0; i < n; ++i) Y_pm1[i] = (Y01[i] == 1.0) ? 1.0 : -1.0;
     auto calibrate_b0 = [](const VectorXd& y01, const VectorXd& eta) {
         double b0 = 0.0;
         for (int it = 0; it < 50; ++it) {
@@ -381,16 +381,15 @@ extern "C" SEXP single_pcghal_classi_ridge_call(SEXP X_, SEXP Y_, SEXP maxdeg_,
         return b0;
     };
-    VectorXd alpha = logistic_ridge_init(Y_pm1, Xtilde, lambda);
+    VectorXd alpha = logistic_ridge_init_y01(Y01, Xtilde, lambda);
     VectorXd eta = Xtilde * alpha;
     const double b0 = calibrate_b0(Y01, eta);
+    // Soft cross-entropy risk (equals the {-1,+1} logistic risk on hard labels).
     double risk = 0.0;
     for (int i = 0; i < n; ++i) {
-        double ymu = Y_pm1[i] * (eta[i] + b0);
-        if (ymu > 0)
-            risk += std::log1p(std::exp(-ymu));
-        else
-            risk += -ymu + std::log1p(std::exp(ymu));
+        const double pi = 1.0 / (1.0 + std::exp(-(eta[i] + b0)));
+        const double pp = std::min(1.0 - 1e-15, std::max(1e-15, pi));
+        risk += -(Y01[i] * std::log(pp) + (1.0 - Y01[i]) * std::log(1.0 - pp));
     }
     risk /= n;

hapc-2.1.0/pyproject.toml DELETED Viewed

@@ -1,36 +0,0 @@
-[build-system]
-requires = ["setuptools>=65", "wheel", "cmake>=3.15", "pybind11>=2.6"]
-build-backend = "setuptools.build_meta"
-[project]
-name = "hapc"
-version = "2.1.0"
-description = "Highly Adaptive Principal Components"
-readme = "README.md"
-requires-python = ">=3.8"
-authors = [
-    {name = "Carlos García Meixide", email = "cgmeixide@gmail.com"}
-]
-license = {text = "MIT"}
-classifiers = [
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
-    "Programming Language :: Python :: 3.10",
-    "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
-    "Operating System :: OS Independent",
-]
-dependencies = [
-    "numpy>=1.24,<2.3",
-    "scikit-learn>=1.0",
-]
-[project.optional-dependencies]
-dev = ["pytest", "pytest-cov", "black", "flake8"]
-[project.urls]
-Homepage = "https://github.com/meixide/hapc"
-Documentation = "https://github.com/meixide/hapc#readme"
-Repository = "https://github.com/meixide/hapc.git"
-Issues = "https://github.com/meixide/hapc/issues"