PyPI - oqboost - Versions diffs - 0.1.3__py3-none-any.whl - Mend

oqboost 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

oqboost/__init__.py +33 -0
oqboost/_classifier.py +499 -0
oqboost/_ext/liboqboost.dylib +0 -0
oqboost/_oqboost.py +470 -0
oqboost/_regressor.py +400 -0
oqboost-0.1.3.dist-info/METADATA +203 -0
oqboost-0.1.3.dist-info/RECORD +10 -0
oqboost-0.1.3.dist-info/WHEEL +5 -0
oqboost-0.1.3.dist-info/licenses/LICENSE +21 -0
oqboost-0.1.3.dist-info/top_level.txt +1 -0

oqboost/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""
+OQBoost — Oblique Gradient-Boosted Decision Trees.
+Gradient-boosted oblique decision trees where split directions are optimized
+by a C++ BFS engine with zero GPU-CPU sync overhead during training.
+Quickstart
+----------
+>>> from oqboost import OQBoostClassifier
+>>> clf = OQBoostClassifier(n_estimators=500, max_depth=6)
+>>> clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
+>>> clf.predict_proba(X_test)
+"""
+from ._classifier import OQBoostClassifier
+from ._regressor import OQBoostRegressor
+from ._oqboost import OQBoostTree
+def load_model(path: str) -> OQBoostClassifier | OQBoostRegressor:
+    """Load a model saved with ``clf.save(path)``.
+    Parameters
+    ----------
+    path : str
+        File to read; handed unchanged to ``joblib.load``.
+    Returns
+    -------
+    Whatever object ``joblib.load`` reconstructs from the file.  No type
+    check is performed here, so the result is only as trustworthy as the
+    file itself.
+    Notes
+    -----
+    joblib files are pickle-based and unpickling can execute arbitrary
+    code.  Only load files that come from a trusted source.
+    """
+    # joblib.load retrieves the actual pickled estimator type
+    import joblib
+    return joblib.load(path)
+# Package version string.
+__version__ = "0.1.3"
+# Public names exported by ``from oqboost import *``.
+__all__ = [
+    "OQBoostClassifier",
+    "OQBoostRegressor",
+    "OQBoostTree",
+    "load_model",
+]

oqboost/_classifier.py ADDED Viewed

@@ -0,0 +1,499 @@
+from __future__ import annotations
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.utils.validation import check_is_fitted
+class OQBoostClassifier(BaseEstimator, ClassifierMixin):
+    """
+    OQBoost: gradient-boosted oblique decision trees.
+    Split directions are found by a C++ engine running a candidate tournament
+    per node (axis scan, inherited A/B/C mutations, pobs_sis orthogonal
+    blocks, direction cache) with lazy best-first tree growth and a
+    depth-adaptive candidate budget.
+    Two effective hyperparameters per tree (max_depth, reg_lambda).
+    Native missing-value handling (numeric NaN → mean imputation baked into
+    the binning context) and native categorical handling (per-round
+    gradient-rank target encoding; categories participate in both axis
+    scans and oblique projections).  Columns named in ``cat_features`` are
+    internally moved after the numeric block; category values must be
+    numeric IDs (floats holding integers).  NaN is allowed anywhere.
+    Labels passed to ``fit`` are cast to int64 and the number of classes is
+    taken as ``max(y) + 1``, so labels are expected to be the integers
+    ``0 .. K-1``.  ``predict`` returns the argmax column index of
+    ``predict_proba``.
+    Parameters
+    ----------
+    n_estimators : int, default 1000
+        Number of boosting rounds.
+    learning_rate : float, default 0.03
+        Shrinkage applied to each tree's leaf values.
+    max_depth : int, default 6
+        Maximum tree depth; leaf budget is 2^max_depth (depth=6 → 64 leaves,
+        matching XGBoost/CatBoost defaults) unless ``max_leaves`` is given.
+    reg_lambda : float, default 1.0
+        L2 regularisation on leaf weights (Newton step denominator).
+    subsample : float, default 0.8
+        Fraction of training samples used to build each tree.  Rows are
+        drawn independently per round; if a draw keeps fewer than
+        ``min(n_samples, 1000)`` rows the full training set is used for that
+        round.  Ignored when ``goss`` is True.
+    early_stopping_rounds : int or None, default 50
+        Stop if validation loss does not improve for this many rounds.
+        Only active when ``fit`` receives an ``eval_set``.
+    random_state : int or None, default None
+        Seed for reproducibility.  None falls back to the fixed seed 42.
+    verbose : bool, default False
+        Print per-round metrics during training.
+    cat_features : list of str or int, optional
+        Column names (if X is a DataFrame) or column indices treated as
+        categorical.  These features are excluded from numerical projections.
+        When left empty and ``fit`` receives a DataFrame, every column of
+        ``category`` or ``object`` dtype is detected as categorical and the
+        detected names are stored back into this attribute.
+    class_weight : str or None, default None
+        "balanced" applies a prior-corrected argmax decision rule in
+        ``predict`` (training itself is always unweighted, so
+        ``predict_proba`` stays calibrated).  Improves balanced accuracy
+        under class imbalance with no log-loss cost.  Pass None for the
+        plain argmax rule.
+    prior_alpha : float, default 0.5
+        Strength of the prior correction used when ``class_weight ==
+        "balanced"``: predictions are ``argmax P / prior**alpha``.
+        0 = plain MAP rule (max raw accuracy), 1 = full correction
+        (max balanced accuracy).  The default 0.5 is the geometric-mean
+        compromise and typically maximises macro-F1.
+    inherited_rp_ratio : float, default 1.0
+        Fraction of split-direction candidates generated by parent weight
+        inheritance (vs fresh gradient-guided random projections).
+    mutation_rate : float, default 0.1
+        Base noise scale for axis-maintaining mutation (Strategy A);
+        decays with node depth as rate/sqrt(1+depth).
+    mutation_strength : float, default 0.5
+        Base weight for new-axis borrowing (Strategy B);
+        decays with node depth as strength/(1+depth).
+    pobs : bool, default False
+        Inject 8 pobs_sis candidates (SIS-weighted support, exact
+        Haar-orthogonal block) into every node's tournament, carved from
+        the inherited budget.  Validated to improve logloss/AUC on real
+        benchmarks.  The default False keeps the pure A/B/C pool; set True
+        to enable.
+    goss : bool, default False
+        Gradient-based one-side sampling.  When True, each round keeps the
+        rows with the largest mean absolute gradient plus a random sample
+        of the remaining rows, and ``subsample`` is not applied.
+    goss_top_rate : float, default 0.2
+        Fraction of all rows kept for their large gradients when ``goss``
+        is True.
+    goss_other_rate : float, default 0.1
+        Fraction of the remaining (small-gradient) rows sampled when
+        ``goss`` is True.  Gradients and hessians of the sampled rows are
+        multiplied by ``(1 - goss_top_rate) / goss_other_rate``.
+    reg_alpha : float, default 0.0
+        Forwarded unchanged to the C++ tree builder (presumably L1
+        regularisation on leaf weights; confirm against the engine).
+    gamma : float, default 0.0
+        Forwarded unchanged to the C++ tree builder (presumably the minimum
+        gain required to split; confirm against the engine).
+    min_child_weight : float, default 1.0
+        Forwarded unchanged to the C++ tree builder (presumably the minimum
+        hessian sum per child; confirm against the engine).
+    max_leaves : int or None, default None
+        Leaf budget per tree.  None means ``2 ** max_depth``.
+    max_bin : int, default 255
+        Passed to the binning context that is built once per ``fit``.
+    colsample_bynode : float, default 1.0
+        Forwarded unchanged to the C++ tree builder (presumably the fraction
+        of features considered per node; confirm against the engine).
+    multi_strategy : str, default "shared"
+        "ovr" with three or more classes trains one binary model per class
+        and combines their logit margins with a softmax.  Any other value
+        trains a single ensemble with one output per class.
+    """
+    def __init__(
+        self,
+        n_estimators: int = 1000,
+        learning_rate: float = 0.03,
+        max_depth: int = 6,
+        reg_lambda: float = 1.0,
+        subsample: float = 0.8,
+        early_stopping_rounds: int | None = 50,
+        random_state: int | None = None,
+        verbose: bool = False,
+        cat_features: list | None = None,
+        class_weight: str | None = None,
+        prior_alpha: float = 0.5,
+        inherited_rp_ratio: float = 1.0,
+        mutation_rate: float = 0.1,
+        mutation_strength: float = 0.5,
+        pobs: bool = False,
+        goss: bool = False,
+        goss_top_rate: float = 0.2,
+        goss_other_rate: float = 0.1,
+        reg_alpha: float = 0.0,
+        gamma: float = 0.0,
+        min_child_weight: float = 1.0,
+        max_leaves: int | None = None,
+        max_bin: int = 255,
+        colsample_bynode: float = 1.0,
+        multi_strategy: str = "shared",
+    ):
+        """Store every hyperparameter verbatim; no validation happens here."""
+        # Boosting schedule and tree shape.
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.reg_lambda = reg_lambda
+        self.subsample = subsample
+        self.early_stopping_rounds = early_stopping_rounds
+        self.random_state = random_state
+        self.verbose = verbose
+        # Categorical handling and the decision rule used by ``predict``.
+        self.cat_features = cat_features
+        self.class_weight = class_weight
+        self.prior_alpha = prior_alpha
+        # Split-direction candidate generation.
+        self.inherited_rp_ratio = inherited_rp_ratio
+        self.mutation_rate = mutation_rate
+        self.mutation_strength = mutation_strength
+        self.pobs = pobs
+        # Row sampling by gradient magnitude.
+        self.goss = goss
+        self.goss_top_rate = goss_top_rate
+        self.goss_other_rate = goss_other_rate
+        # Remaining knobs handed to the tree builder / binning context.
+        self.reg_alpha = reg_alpha
+        self.gamma = gamma
+        self.min_child_weight = min_child_weight
+        self.max_leaves = max_leaves
+        self.max_bin = max_bin
+        self.colsample_bynode = colsample_bynode
+        self.multi_strategy = multi_strategy
+        # Column names are only known once ``fit`` has seen a DataFrame.
+        self.feature_names_in_ = None
+    # ── public fit/predict ────────────────────────────────────────────────────
+    def _prepare_data(self, X, is_fit=False):
+        """Convert ``X`` to a float32 ndarray, integer-coding categorical columns.
+        Non-DataFrame input is returned as ``np.asarray(X, dtype=np.float32)``
+        (``X.values`` is unwrapped first when that attribute exists).  For a
+        DataFrame, the categorical columns are those listed in
+        ``cat_features`` (labels, or positions inside the frame; entries that
+        match nothing are ignored) or, when ``cat_features`` is empty, every
+        column of ``category`` or ``object`` dtype.  Each categorical column
+        is replaced by its pandas category code, and code -1 (missing, or a
+        value outside the known categories) becomes NaN.
+        Parameters
+        ----------
+        X : DataFrame or array-like
+        is_fit : bool
+            True when called from ``fit``: column names and per-column
+            category lists are recorded on the estimator.  False at
+            validation / prediction time: the recorded category lists are
+            reused so codes match the ones seen during training.
+        Returns
+        -------
+        np.ndarray of dtype float32
+        """
+        import pandas as pd
+        if not isinstance(X, pd.DataFrame):
+            if is_fit:
+                # Fitting on a plain array: discard column names and category
+                # mappings left over from an earlier DataFrame fit, otherwise
+                # name-based ``cat_features`` would be resolved against
+                # columns that no longer exist.
+                self.feature_names_in_ = None
+                self._cat_mappings_ = {}
+            if hasattr(X, "values"):
+                X = X.values
+            return np.asarray(X, dtype=np.float32)
+        X_prep = X.copy()
+        if is_fit:
+            self.feature_names_in_ = list(X.columns)
+            self._cat_mappings_ = {}
+        cat_cols = []
+        if self.cat_features:
+            for cf in self.cat_features:
+                if isinstance(cf, (int, np.integer)):
+                    # Positional entry; silently skipped when out of range.
+                    if 0 <= cf < len(X.columns):
+                        cat_cols.append(X.columns[cf])
+                elif cf in X.columns:
+                    cat_cols.append(cf)
+        else:
+            # Nothing declared: fall back to dtype-based detection.
+            for col in X.columns:
+                if isinstance(X[col].dtype, pd.CategoricalDtype) or X[col].dtype == object:
+                    cat_cols.append(col)
+        if is_fit and not self.cat_features and cat_cols:
+            # NOTE: this overwrites the constructor parameter so that later
+            # calls (and ``_resolve_cat_idx``) see the detected columns.
+            self.cat_features = cat_cols
+        for col in cat_cols:
+            if not isinstance(X_prep[col].dtype, pd.CategoricalDtype):
+                X_prep[col] = X_prep[col].astype('category')
+            if is_fit:
+                self._cat_mappings_[col] = list(X_prep[col].cat.categories)
+            else:
+                if hasattr(self, "_cat_mappings_") and col in self._cat_mappings_:
+                    # Re-express the column in the training categories so the
+                    # integer codes line up with those used during fit.
+                    X_prep[col] = pd.Categorical(
+                        X_prep[col], categories=self._cat_mappings_[col]
+                    )
+            codes = X_prep[col].cat.codes.astype(np.float32)
+            codes[codes == -1] = np.nan
+            X_prep[col] = codes
+        return X_prep.values.astype(np.float32)
+    def fit(
+        self,
+        X,
+        y,
+        eval_set: list[tuple] | None = None,
+        sample_weight=None,
+    ) -> OQBoostClassifier:
+        """
+        Fit the classifier.
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+        y : array-like of shape (n_samples,)
+            Class labels.  They are cast to int64 and used directly as class
+            indices, so they must be non-negative integers.
+        eval_set : list of (X_val, y_val) tuples, optional
+            First tuple is used for early stopping and validation metrics.
+        sample_weight : array-like of shape (n_samples,), optional
+            Individual weights for each sample.
+        Returns
+        -------
+        self
+        Raises
+        ------
+        ValueError
+            If ``y`` or ``sample_weight`` is not 1D with one entry per row of
+            ``X``, or if ``y`` contains a negative label.
+        """
+        X = self._prepare_data(X, is_fit=True)
+        y = np.asarray(y, dtype=np.int64)
+        if y.ndim != 1 or y.shape[0] != X.shape[0]:
+            raise ValueError("y must be a 1D array with one label per row of X.")
+        if y.size and y.min() < 0:
+            # Labels index the one-hot target matrix; a negative value would
+            # otherwise surface as an obscure error deeper in training.
+            raise ValueError("y must contain non-negative integer class labels.")
+        self.n_features_in_ = X.shape[1]
+        self.classes_       = np.unique(y)
+        if sample_weight is not None:
+            sample_weight = np.asarray(sample_weight, dtype=np.float32)
+            if sample_weight.ndim != 1 or len(sample_weight) != X.shape[0]:
+                raise ValueError("sample_weight must be a 1D array of length N.")
+        D_num = self._resolve_D_num(X.shape[1])
+        X_val, y_val = None, None
+        if eval_set:
+            X_val, y_val = eval_set[0]
+            X_val = self._prepare_data(X_val, is_fit=False)
+            y_val = np.asarray(y_val, dtype=np.int64)
+        # Forget one-vs-rest ensembles from an earlier fit; predict_proba
+        # prefers them whenever they are present, so a refit that takes the
+        # single-ensemble path must not leave stale ones behind.
+        self.ovr_ensembles_ = []
+        self._fit_core(X, y, X_val, y_val, D_num, sample_weight=sample_weight)
+        return self
+    def predict(self, X) -> np.ndarray:
+        """Return the predicted class index for every row of ``X``.
+        The result is the argmax over the columns of ``predict_proba``,
+        optionally after the prior correction described below.
+        """
+        check_is_fitted(self, "trees_")
+        P = self.predict_proba(X)
+        # Balanced decision rule: argmax of prior-corrected probabilities
+        # P / prior^alpha.  Training stays unweighted (calibrated probas);
+        # only the decision threshold shifts.  alpha=1 maximises balanced
+        # accuracy, alpha=0 is the plain MAP rule; the default alpha=0.5
+        # (geometric-mean compromise) keeps most of the balanced-accuracy
+        # gain while staying within ~1.5 pp of the best raw accuracy and
+        # typically maximises macro-F1.
+        alpha = getattr(self, "prior_alpha", 0.5)
+        if (getattr(self, "class_weight", None) == "balanced"
+                and getattr(self, "_prior_", None) is not None
+                and alpha > 0.0):
+            prior = np.asarray(self._prior_, dtype=np.float32)[None, :]
+            # A class index with no training rows has prior 0.  Dividing by
+            # it would produce inf and make that class win every argmax, so
+            # treat its prior as +inf instead, which zeroes its score.
+            safe_prior = np.where(prior > 0.0, prior, np.float32(np.inf))
+            P = P / np.power(safe_prior, alpha)
+        return P.argmax(axis=1)
+    def predict_proba(self, X) -> np.ndarray:
+        """Return class probabilities of shape (n_samples, n_classes).
+        ``X`` is encoded once with this estimator's own ``_prepare_data`` so
+        categorical codes match the ones used during ``fit``.  With
+        one-vs-rest ensembles, column ``c`` of the score matrix is the logit
+        margin ``F[:, 1] - F[:, 0]`` of the c-th binary model; otherwise the
+        scores are the raw outputs of the single ensemble.  In both cases a
+        numerically stabilised softmax turns scores into probabilities.
+        """
+        check_is_fitted(self, "trees_")
+        from ._oqboost import predict_ensemble
+        X = np.ascontiguousarray(
+            self._prepare_data(X, is_fit=False), dtype=np.float32
+        )
+        ovr_models = getattr(self, "ovr_ensembles_", None)
+        if ovr_models:
+            # The binary sub-models were fitted on the array produced by this
+            # estimator's encoder, so they must be fed the same encoding
+            # here; letting each one re-encode a raw DataFrame could assign
+            # different category codes than the ones it was trained on.
+            scores = np.zeros((X.shape[0], len(ovr_models)), dtype=np.float32)
+            for c, clf_c in enumerate(ovr_models):
+                perm = getattr(clf_c, "_col_perm_", None)
+                X_c = X if perm is None else np.ascontiguousarray(X[:, perm])
+                F = predict_ensemble(
+                    clf_c.trees_, X_c, 2, clf_c.learning_rate,
+                    np.array(clf_c.F_init_, dtype=np.float32)
+                )
+                scores[:, c] = F[:, 1] - F[:, 0]
+        else:
+            if getattr(self, "_col_perm_", None) is not None:
+                # Same numeric-first column order that was used in training.
+                X = np.ascontiguousarray(X[:, self._col_perm_])
+            K = len(self.F_init_)
+            scores = predict_ensemble(self.trees_, X, K, self.learning_rate,
+                                      np.array(self.F_init_, dtype=np.float32))
+        # Subtract the row maximum before exponentiating to avoid overflow.
+        shifted = scores - scores.max(axis=1, keepdims=True)
+        P = np.exp(shifted)
+        P /= P.sum(axis=1, keepdims=True)
+        return P
+    # ── save / load ───────────────────────────────────────────────────────────
+    def save(self, path: str) -> None:
+        """Write this estimator to ``path`` with joblib (compression level 3)."""
+        from joblib import dump as _dump
+        _dump(self, path, compress=3)
+    @classmethod
+    def load(cls, path: str) -> OQBoostClassifier:
+        """Read back an object written by :meth:`save`.
+        The file is unpickled by joblib, which can execute arbitrary code;
+        only load files from a trusted source.  The loaded object is
+        returned as-is, without checking that it is an instance of ``cls``.
+        """
+        from joblib import load as _load
+        restored = _load(path)
+        return restored
+    def get_n_trees(self) -> int:
+        """Count the fitted trees, summed over all one-vs-rest models if any."""
+        check_is_fitted(self, "trees_")
+        ensembles = getattr(self, "ovr_ensembles_", None)
+        if not ensembles:
+            return len(self.trees_)
+        total = 0
+        for member in ensembles:
+            total += len(member.trees_)
+        return total
+    # ── internal ─────────────────────────────────────────────────────────────
+    def _resolve_cat_idx(self, D: int) -> list[int]:
+        """Sorted column indices declared categorical via ``cat_features``.
+        Integer entries are taken as positions; other entries are looked up
+        in ``feature_names_in_`` when it is available.  Entries that cannot
+        be resolved, or that fall outside ``0 <= index < D``, are ignored,
+        matching how ``_prepare_data`` treats out-of-range positions.
+        Parameters
+        ----------
+        D : int
+            Total number of columns in the prepared feature matrix.
+        """
+        if not self.cat_features:
+            return []
+        names = getattr(self, "feature_names_in_", None)
+        cat_idx = set()
+        for cf in self.cat_features:
+            if isinstance(cf, (int, np.integer)):
+                idx = int(cf)
+            elif names is not None and cf in names:
+                idx = names.index(cf)
+            else:
+                continue
+            # A negative or too-large index would corrupt the column
+            # permutation built from this list (duplicated column or
+            # IndexError), so keep only valid positions.
+            if 0 <= idx < D:
+                cat_idx.add(idx)
+        return sorted(cat_idx)
+    def _resolve_D_num(self, D: int) -> int:
+        """Count the columns of a ``D``-column matrix that are not categorical."""
+        n_categorical = len(self._resolve_cat_idx(D))
+        return D - n_categorical
+    def _fit_core(self, X, y, X_val, y_val, D_num, sample_weight=None):
+        """Train on already-prepared arrays.
+        With ``multi_strategy == "ovr"`` and three or more classes, one
+        binary ``OQBoostClassifier`` is fitted per class and stored in
+        ``ovr_ensembles_``.  Otherwise a single softmax ensemble is boosted
+        round by round and stored in ``trees_``.
+        Parameters
+        ----------
+        X : float32 ndarray of shape (N, D), as returned by ``_prepare_data``
+        y : int64 ndarray of shape (N,), used directly as class indices
+        X_val, y_val : validation arrays or None
+        D_num : int
+            Number of non-categorical columns.
+        sample_weight : float32 ndarray of shape (N,) or None
+            Multiplies each row's gradients and hessians every round.
+        Sets ``trees_``, ``F_init_``, ``_prior_``, ``_col_perm_`` and
+        ``ovr_ensembles_`` on the estimator.
+        """
+        from ._oqboost import OQBoostContext, update_gradients
+        N, D = X.shape
+        K    = int(y.max()) + 1
+        seed = self.random_state if self.random_state is not None else 42
+        if K >= 3 and self.multi_strategy == "ovr":
+            # Train K separate binary classifiers
+            self.ovr_ensembles_ = []
+            self.trees_ = []  # Empty placeholder for check_is_fitted
+            self.F_init_ = [0.0] * K  # Dummy
+            cnt = np.bincount(y, minlength=K).astype(np.float32)
+            self._prior_ = (cnt / N).tolist()
+            self._col_perm_ = None
+            # X is a plain array at this point, so a sub-model has no column
+            # names to resolve name-based ``cat_features`` against and would
+            # silently treat those columns as numeric.  Hand it positional
+            # indices instead.
+            sub_cat_features = self._resolve_cat_idx(D) or None
+            for c in range(K):
+                y_c = (y == c).astype(np.int64)
+                eval_set_c = None
+                if X_val is not None:
+                    y_val_c = (y_val == c).astype(np.int64)
+                    eval_set_c = [(X_val, y_val_c)]
+                clf_c = OQBoostClassifier(
+                    n_estimators=self.n_estimators,
+                    learning_rate=self.learning_rate,
+                    max_depth=self.max_depth,
+                    reg_lambda=self.reg_lambda,
+                    subsample=self.subsample,
+                    early_stopping_rounds=self.early_stopping_rounds,
+                    random_state=seed + c,  # distinct seed per class
+                    verbose=self.verbose,
+                    cat_features=sub_cat_features,
+                    class_weight=None,
+                    prior_alpha=self.prior_alpha,
+                    inherited_rp_ratio=self.inherited_rp_ratio,
+                    mutation_rate=self.mutation_rate,
+                    mutation_strength=self.mutation_strength,
+                    pobs=self.pobs,
+                    goss=self.goss,
+                    goss_top_rate=self.goss_top_rate,
+                    goss_other_rate=self.goss_other_rate,
+                    reg_alpha=self.reg_alpha,
+                    gamma=self.gamma,
+                    min_child_weight=self.min_child_weight,
+                    max_leaves=self.max_leaves,
+                    max_bin=self.max_bin,
+                    colsample_bynode=self.colsample_bynode,
+                    multi_strategy="shared"
+                )
+                if self.verbose:
+                    print(f"  [OQBoost OVR] Fitting class {c+1}/{K}...")
+                clf_c.fit(X, y_c, eval_set=eval_set_c, sample_weight=sample_weight)
+                self.ovr_ensembles_.append(clf_c)
+            return
+        # Single-ensemble path: drop one-vs-rest models from any earlier fit,
+        # because predict_proba prefers them whenever they are present.
+        self.ovr_ensembles_ = []
+        cat_idx = self._resolve_cat_idx(D)
+        if cat_idx and cat_idx != list(range(D_num, D)):
+            # Reorder columns so the numeric block comes first and the
+            # categorical columns follow it.
+            cat_set = set(cat_idx)
+            perm = [i for i in range(D) if i not in cat_set] + cat_idx
+            self._col_perm_ = np.asarray(perm, dtype=np.intp)
+        else:
+            self._col_perm_ = None
+        if self._col_perm_ is not None:
+            X = np.ascontiguousarray(X[:, self._col_perm_])
+            if X_val is not None:
+                X_val = np.ascontiguousarray(X_val[:, self._col_perm_])
+        cnt = np.bincount(y, minlength=K).astype(np.float32)
+        self._prior_ = (cnt / N).tolist()
+        # Initial scores: centred log class frequencies (1e-8 guards log(0)).
+        lp = np.log(cnt / N + 1e-8).astype(np.float32)
+        lp -= lp.mean()
+        self.F_init_ = lp.tolist()
+        Fsc   = np.tile(lp, (N, 1))
+        F_val = np.tile(lp, (X_val.shape[0], 1)) if X_val is not None else None
+        # One-hot targets consumed by update_gradients.
+        oh = np.zeros((N, K), dtype=np.float32)
+        oh[np.arange(N), y] = 1.0
+        rng = np.random.default_rng(seed)
+        best_val_loss = float("inf")
+        # Trees are only ever appended, so remembering how many existed at
+        # the best validation round is enough to roll back on early stop.
+        best_n_trees = 0
+        no_improv = 0
+        self.trees_ = []
+        ctx = OQBoostContext(X, D_num=D_num, max_bin=self.max_bin)
+        G_w = np.empty((N, K), dtype=np.float32)
+        H_w = np.empty((N, K), dtype=np.float32)
+        full_idx = np.arange(N, dtype=np.int32)
+        max_leaves = (
+            self.max_leaves if self.max_leaves is not None
+            else (1 << self.max_depth)
+        )
+        try:
+            for m in range(self.n_estimators):
+                update_gradients(Fsc, oh, G_w, H_w)
+                if sample_weight is not None:
+                    G_w *= sample_weight[:, np.newaxis]
+                    H_w *= sample_weight[:, np.newaxis]
+                if self.goss:
+                    grad_norms = np.mean(np.abs(G_w), axis=1)
+                    top_n = int(self.goss_top_rate * N)
+                    other_n = int(self.goss_other_rate * (N - top_n))
+                    if top_n > 0 and other_n > 0 and (top_n + other_n) < N:
+                        k = N - top_n
+                        # After the partition, positions k.. hold the top_n
+                        # rows with the largest gradient norms.
+                        partitioned_idx = np.argpartition(grad_norms, k)
+                        top_idx = partitioned_idx[k:]
+                        remaining_idx = partitioned_idx[:k]
+                        random_idx = rng.choice(remaining_idx, size=other_n, replace=False)
+                        tree_sub = np.concatenate([top_idx, random_idx]).astype(np.int32)
+                        # NOTE(review): other_n is a fraction of the
+                        # *remaining* rows, so the exact re-weighting ratio
+                        # is (N - top_n) / other_n, which is about
+                        # 1 / goss_other_rate; the factor below is smaller.
+                        # Left unchanged; confirm the intended definition.
+                        scale_factor = (1.0 - self.goss_top_rate) / self.goss_other_rate
+                        G_w[random_idx] *= scale_factor
+                        H_w[random_idx] *= scale_factor
+                    else:
+                        tree_sub = full_idx
+                elif self.subsample < 1.0:
+                    tree_sub = np.flatnonzero(
+                        rng.random(N) < self.subsample
+                    ).astype(np.int32)
+                    # Too few rows drawn: use the full set for this round.
+                    if len(tree_sub) < min(N, 1000):
+                        tree_sub = full_idx
+                else:
+                    tree_sub = full_idx
+                t, out_pred = ctx.build(
+                    G_w, H_w, tree_sub, self.max_depth, self.reg_lambda,
+                    inherited_rp_ratio=self.inherited_rp_ratio,
+                    mutation_rate=self.mutation_rate,
+                    mutation_strength=self.mutation_strength,
+                    seed=int(rng.integers(1 << 30)),
+                    pobs=getattr(self, "pobs", False),
+                    reg_alpha=self.reg_alpha,
+                    gamma=self.gamma,
+                    min_child_weight=self.min_child_weight,
+                    colsample_bynode=self.colsample_bynode,
+                    max_leaves=max_leaves,
+                )
+                self.trees_.append(t)
+                Fsc += self.learning_rate * out_pred
+                val_str = ""
+                if X_val is not None:
+                    pred_val = t.predict(X_val)
+                    F_val    = F_val + self.learning_rate * pred_val
+                    Fv_sh    = F_val - F_val.max(axis=1, keepdims=True)
+                    P_val    = np.exp(Fv_sh)
+                    P_val   /= P_val.sum(axis=1, keepdims=True)
+                    val_loss = float(
+                        -np.log(P_val[np.arange(len(y_val)), y_val].clip(1e-8)).mean()
+                    )
+                    val_acc  = (P_val.argmax(axis=1) == y_val).mean()
+                    val_str  = f" | ValLoss={val_loss:.4f} | ValAcc={val_acc:.4f}"
+                    if val_loss < best_val_loss:
+                        best_val_loss = val_loss
+                        no_improv     = 0
+                        best_n_trees  = len(self.trees_)
+                    else:
+                        no_improv += 1
+                if self.verbose:
+                    Fsc_sh = Fsc - Fsc.max(axis=1, keepdims=True)
+                    Pm = np.exp(Fsc_sh)
+                    Pm /= Pm.sum(axis=1, keepdims=True)
+                    ll  = -np.log(Pm[np.arange(N), y].clip(1e-8)).mean()
+                    acc = (Pm.argmax(axis=1) == y).mean()
+                    print(
+                        f"  [OQBoost] Round {m+1:3d} | Loss={ll:.4f} | "
+                        f"Acc={acc:.4f}{val_str}"
+                    )
+                if X_val is not None and self.early_stopping_rounds is not None:
+                    if no_improv >= self.early_stopping_rounds:
+                        if self.verbose:
+                            print(f"  [OQBoost] Early stopping at round {m+1}")
+                        # Keep only the trees present at the best round.
+                        self.trees_ = self.trees_[:best_n_trees]
+                        break
+        finally:
+            # Always release the native context, even if a round fails.
+            ctx.close()

oqboost/_ext/liboqboost.dylib ADDED Viewed

Binary file