PyPI - hrboost - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hrboost 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

hrboost/__init__.py +3 -0
hrboost/_lib.py +70 -0
hrboost/libhrboost.dylib +0 -0
hrboost/model.py +250 -0
hrboost-0.1.0.dist-info/METADATA +130 -0
hrboost-0.1.0.dist-info/RECORD +8 -0
hrboost-0.1.0.dist-info/WHEEL +5 -0
hrboost-0.1.0.dist-info/top_level.txt +1 -0

hrboost/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .model import HRBoostClassifier, HRBoostRegressor
+__all__ = ["HRBoostClassifier", "HRBoostRegressor"]

hrboost/_lib.py ADDED Viewed

@@ -0,0 +1,70 @@
+import ctypes
+import os
+import pathlib
+def _load():
+    """Locate and load the native ``libhrboost`` shared library.
+
+    For each known file name, the directory containing this module is
+    probed first, then the directory two levels above it (used for local
+    development builds). A candidate that exists but cannot be loaded --
+    for example a macOS ``.dylib`` found on a Linux host -- is skipped so
+    that the remaining candidates still get a chance.
+
+    Returns:
+        ctypes.CDLL: handle to the first candidate that loads.
+
+    Raises:
+        FileNotFoundError: no candidate file exists.
+        OSError: at least one candidate exists but none could be loaded;
+            the message lists every path with the loader's error.
+    """
+    here = pathlib.Path(__file__).resolve().parent
+    root = here.parent.parent
+    failures = []
+    for name in ("libhrboost.dylib", "libhrboost.so"):
+        # Same probing order as before: package directory, then `root`.
+        for candidate in (here / name, root / name):
+            if not candidate.exists():
+                continue
+            try:
+                return ctypes.CDLL(str(candidate))
+            except OSError as exc:
+                # Present but unusable (wrong binary format, missing
+                # dependency, ...): remember why and keep looking.
+                failures.append(f"{candidate}: {exc}")
+    if failures:
+        raise OSError(
+            "libhrboost was found but could not be loaded:\n  "
+            + "\n  ".join(failures)
+        )
+    raise FileNotFoundError(
+        "libhrboost not found — run `make` in the project root"
+    )
+# Load the shared library once at import time, then declare the signature of
+# every exported function so ctypes converts arguments and results explicitly.
+_lib = _load()
+_FLOAT_P = ctypes.POINTER(ctypes.c_float)
+_DOUBLE_P = ctypes.POINTER(ctypes.c_double)
+_INT_P = ctypes.POINTER(ctypes.c_int)
+# hrboost_create() -> opaque model handle
+_lib.hrboost_create.argtypes = []
+_lib.hrboost_create.restype = ctypes.c_void_p
+# hrboost_free(handle)
+_lib.hrboost_free.argtypes = [ctypes.c_void_p]
+_lib.hrboost_free.restype = None
+# hrboost_fit(handle, X, y, cat_features, objective, <7 doubles>, <9 ints>)
+_lib.hrboost_fit.argtypes = [
+    ctypes.c_void_p,   # 1. model handle
+    _FLOAT_P,          # 2. X
+    _FLOAT_P,          # 3. y (float target)
+    _INT_P,            # 4. cat_features ptr
+    ctypes.c_char_p,   # 5. objective
+    # 6-12: learning_rate, reg_lambda, subsample, colsample_bytree,
+    #       min_child_weight, gamma, max_delta_step
+    *([ctypes.c_double] * 7),
+    # 13-21: n, D, n_estimators, max_depth, max_leaves, n_bins,
+    #        cat_features_len, random_state, num_classes
+    *([ctypes.c_int] * 9),
+]
+_lib.hrboost_fit.restype = None
+# Both prediction entry points share one signature:
+# (handle, X, n, D, output buffer of doubles)
+_PREDICT_ARGTYPES = [
+    ctypes.c_void_p,
+    _FLOAT_P,      # X
+    ctypes.c_int,  # n
+    ctypes.c_int,  # D
+    _DOUBLE_P,     # out_p / out_y
+]
+_lib.hrboost_predict_proba.argtypes = list(_PREDICT_ARGTYPES)
+_lib.hrboost_predict_proba.restype = None
+_lib.hrboost_predict.argtypes = list(_PREDICT_ARGTYPES)
+_lib.hrboost_predict.restype = None

hrboost/libhrboost.dylib ADDED Viewed

Binary file

hrboost/model.py ADDED Viewed

@@ -0,0 +1,250 @@
+import os
+import ctypes
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
+from sklearn.utils.validation import check_is_fitted
+from ._lib import _lib
+class HRBoostClassifier(ClassifierMixin, BaseEstimator):
+    """Gradient-boosted tree classifier backed by the native hrboost library.
+
+    Constructor arguments are stored unchanged on the instance; the
+    hyper-parameters are forwarded to the native ``hrboost_fit`` routine
+    when :meth:`fit` runs. ``verbose`` reaches the native code through the
+    ``HRBOOST_VERBOSE`` environment variable, which is set only for the
+    duration of the fit call.
+
+    The mixin is listed before ``BaseEstimator`` so that mixin-provided
+    behaviour takes precedence in the MRO, as the scikit-learn developer
+    guide asks for.
+
+    Attributes set by :meth:`fit`:
+        classes_: sorted unique labels seen in ``y``.
+        objective_: ``"binary"`` or ``"multiclass"``.
+        num_classes_: 1 for binary, otherwise the number of model outputs.
+        n_features_in_: number of columns of the training matrix.
+    """
+    def __init__(
+        self,
+        n_estimators=200,
+        learning_rate=0.1,
+        max_depth=4,
+        max_leaves=64,
+        reg_lambda=1.0,
+        subsample=0.8,
+        colsample_bytree=1.0,
+        n_bins=32,
+        min_child_weight=0.1,
+        gamma=0.0,
+        max_delta_step=0.0,
+        cat_features=None,
+        random_state=0,
+        objective="binary",
+        num_classes=None,
+        verbose=True,
+    ):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.max_leaves = max_leaves
+        self.reg_lambda = reg_lambda
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+        self.n_bins = n_bins
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+        self.max_delta_step = max_delta_step
+        self.cat_features = cat_features
+        self.random_state = random_state
+        self.objective = objective
+        self.num_classes = num_classes
+        self.verbose = verbose
+    @staticmethod
+    def _as_matrix(X):
+        """Return ``X`` as a C-contiguous 2-D float32 array or raise ValueError."""
+        X = np.ascontiguousarray(X, dtype=np.float32)
+        if X.ndim != 2:
+            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
+        return X
+    def _release_handle(self):
+        """Free the native model owned by this instance, if there is one."""
+        handle = getattr(self, "_handle", None)
+        # `_lib` may already be torn down while the interpreter shuts down.
+        if handle and _lib is not None:
+            _lib.hrboost_free(handle)
+        self._handle = None
+    def _native_handle(self):
+        """Return the live native handle or raise RuntimeError."""
+        handle = getattr(self, "_handle", None)
+        if not handle:
+            raise RuntimeError(
+                "the native hrboost model is not available (it is not "
+                "carried over by copy/pickle); call fit() again"
+            )
+        return handle
+    def __getstate__(self):
+        """Return the instance state without the raw native pointer.
+
+        A copied or unpickled pointer would be dangling or double-freed,
+        so copies start without a native model and must be refitted.
+        """
+        state = dict(super().__getstate__())
+        state["_handle"] = None
+        return state
+    def fit(self, X, y):
+        """Train the native model on ``X`` and ``y``.
+
+        Args:
+            X: 2-D array-like of features; converted to contiguous float32.
+            y: array-like holding one label per row of ``X``.
+
+        Returns:
+            self
+
+        Raises:
+            ValueError: ``X`` is not 2-D, ``y`` does not have one entry per
+                row, the training set is empty, or ``num_classes`` is
+                smaller than the number of distinct labels.
+            RuntimeError: the native library returned a NULL handle.
+        """
+        X = self._as_matrix(X)
+        n, D = X.shape
+        labels = np.asarray(y).reshape(-1)
+        if labels.size != n:
+            # The native routine is told there are n targets; a shorter
+            # buffer would presumably be read past its end.
+            raise ValueError(f"X has {n} rows but y has {labels.size} entries")
+        if n == 0:
+            raise ValueError("cannot fit on an empty training set")
+        # Encode labels as 0..K-1 so arbitrary label values (strings,
+        # {-1, 1}, {1, 2}, ...) can be mapped back through classes_.
+        self.classes_, encoded = np.unique(labels, return_inverse=True)
+        n_classes = len(self.classes_)
+        if self.objective == "binary" or n_classes <= 2:
+            # NOTE(review): objective="binary" wins even when y holds more
+            # than two distinct labels -- confirm this is intended.
+            self.objective_ = "binary"
+            self.num_classes_ = 1
+        else:
+            if self.num_classes is not None and self.num_classes < n_classes:
+                raise ValueError(
+                    f"num_classes={self.num_classes} is smaller than the "
+                    f"{n_classes} distinct labels found in y"
+                )
+            self.objective_ = "multiclass"
+            self.num_classes_ = (
+                self.num_classes if self.num_classes is not None else n_classes
+            )
+        # Target must be float32 for C++ HRBoost fit
+        target = np.ascontiguousarray(encoded.reshape(-1), dtype=np.float32)
+        self.n_features_in_ = D
+        # Free the model left over from an earlier fit before replacing it.
+        self._release_handle()
+        self._handle = _lib.hrboost_create()
+        if not self._handle:
+            raise RuntimeError("hrboost_create() returned a NULL handle")
+        cat_list = self.cat_features if self.cat_features is not None else []
+        cats = np.asarray(cat_list, dtype=np.int32)
+        obj_bytes = self.objective_.encode("utf-8")
+        # Set environment variables for C++ logging control
+        previous = os.environ.get("HRBOOST_VERBOSE")
+        os.environ["HRBOOST_VERBOSE"] = "1" if self.verbose else "0"
+        try:
+            _lib.hrboost_fit(
+                self._handle,
+                X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                target.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                cats.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
+                ctypes.c_char_p(obj_bytes),
+                ctypes.c_double(self.learning_rate),
+                ctypes.c_double(self.reg_lambda),
+                ctypes.c_double(self.subsample),
+                ctypes.c_double(self.colsample_bytree),
+                ctypes.c_double(self.min_child_weight),
+                ctypes.c_double(self.gamma),
+                ctypes.c_double(self.max_delta_step),
+                ctypes.c_int(n),
+                ctypes.c_int(D),
+                ctypes.c_int(self.n_estimators),
+                ctypes.c_int(self.max_depth),
+                ctypes.c_int(self.max_leaves),
+                ctypes.c_int(self.n_bins),
+                ctypes.c_int(len(cats)),
+                ctypes.c_int(self.random_state),
+                ctypes.c_int(self.num_classes_),
+            )
+        finally:
+            # Put the caller's environment back exactly as it was.
+            if previous is None:
+                os.environ.pop("HRBOOST_VERBOSE", None)
+            else:
+                os.environ["HRBOOST_VERBOSE"] = previous
+        return self
+    def predict_proba(self, X):
+        """Return class probabilities for every row of ``X``.
+
+        Returns:
+            ndarray of shape ``(n, 2)`` for the binary objective (columns
+            ``1 - p`` and ``p``), otherwise ``(n, num_classes_)``.
+
+        Raises:
+            ValueError: ``X`` is not 2-D or its column count differs from
+                the training data.
+            RuntimeError: no native model is attached to this instance.
+        """
+        check_is_fitted(self)
+        handle = self._native_handle()
+        X = self._as_matrix(X)
+        n, D = X.shape
+        if D != self.n_features_in_:
+            raise ValueError(
+                f"X has {D} features, but the model was fitted with "
+                f"{self.n_features_in_}"
+            )
+        is_binary = self.objective_ == "binary"
+        # One value per row for binary, num_classes_ values per row otherwise.
+        width = 1 if is_binary else self.num_classes_
+        out = np.empty(n * width, dtype=np.float64)
+        _lib.hrboost_predict_proba(
+            handle,
+            X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            ctypes.c_int(n),
+            ctypes.c_int(D),
+            out.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+        )
+        if is_binary:
+            return np.column_stack([1.0 - out, out])
+        return out.reshape(n, self.num_classes_)
+    def predict(self, X):
+        """Return the predicted label, taken from ``classes_``, for each row."""
+        check_is_fitted(self)
+        proba = self.predict_proba(X)
+        n_known = len(self.classes_)
+        if self.objective_ == "binary":
+            idx = (proba[:, 1] >= 0.5).astype(int)
+        else:
+            # Only columns that correspond to a label seen during fit.
+            idx = np.argmax(proba[:, :n_known], axis=1)
+        # Guards the single-class case, where index 1 has no label.
+        idx = np.minimum(idx, n_known - 1)
+        return self.classes_[idx]
+    def __del__(self):
+        self._release_handle()
+class HRBoostRegressor(RegressorMixin, BaseEstimator):
+    """Gradient-boosted tree regressor backed by the native hrboost library.
+
+    Constructor arguments are stored unchanged on the instance; the
+    hyper-parameters are forwarded to the native ``hrboost_fit`` routine
+    with the ``"regression"`` objective when :meth:`fit` runs. ``verbose``
+    reaches the native code through the ``HRBOOST_VERBOSE`` environment
+    variable, which is set only for the duration of the fit call.
+
+    The mixin is listed before ``BaseEstimator`` so that mixin-provided
+    behaviour takes precedence in the MRO, as the scikit-learn developer
+    guide asks for.
+
+    Attributes set by :meth:`fit`:
+        objective_: always ``"regression"``.
+        num_classes_: always 1.
+        n_features_in_: number of columns of the training matrix.
+    """
+    def __init__(
+        self,
+        n_estimators=200,
+        learning_rate=0.1,
+        max_depth=4,
+        max_leaves=64,
+        reg_lambda=1.0,
+        subsample=0.8,
+        colsample_bytree=1.0,
+        n_bins=32,
+        min_child_weight=0.1,
+        gamma=0.0,
+        max_delta_step=0.0,
+        cat_features=None,
+        random_state=0,
+        verbose=True,
+    ):
+        self.n_estimators = n_estimators
+        self.learning_rate = learning_rate
+        self.max_depth = max_depth
+        self.max_leaves = max_leaves
+        self.reg_lambda = reg_lambda
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+        self.n_bins = n_bins
+        self.min_child_weight = min_child_weight
+        self.gamma = gamma
+        self.max_delta_step = max_delta_step
+        self.cat_features = cat_features
+        self.random_state = random_state
+        self.verbose = verbose
+    @staticmethod
+    def _as_matrix(X):
+        """Return ``X`` as a C-contiguous 2-D float32 array or raise ValueError."""
+        X = np.ascontiguousarray(X, dtype=np.float32)
+        if X.ndim != 2:
+            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
+        return X
+    def _release_handle(self):
+        """Free the native model owned by this instance, if there is one."""
+        handle = getattr(self, "_handle", None)
+        # `_lib` may already be torn down while the interpreter shuts down.
+        if handle and _lib is not None:
+            _lib.hrboost_free(handle)
+        self._handle = None
+    def __getstate__(self):
+        """Return the instance state without the raw native pointer.
+
+        A copied or unpickled pointer would be dangling or double-freed,
+        so copies start without a native model and must be refitted.
+        """
+        state = dict(super().__getstate__())
+        state["_handle"] = None
+        return state
+    def fit(self, X, y):
+        """Train the native model on ``X`` and ``y``.
+
+        Args:
+            X: 2-D array-like of features; converted to contiguous float32.
+            y: array-like with one target per row of ``X``; converted to
+                contiguous float32.
+
+        Returns:
+            self
+
+        Raises:
+            ValueError: ``X`` is not 2-D or ``y`` does not have one entry
+                per row of ``X``.
+            RuntimeError: the native library returned a NULL handle.
+        """
+        X = self._as_matrix(X)
+        y = np.ascontiguousarray(y, dtype=np.float32)
+        n, D = X.shape
+        if y.size != n:
+            # The native routine is told there are n targets; a shorter
+            # buffer would presumably be read past its end.
+            raise ValueError(f"X has {n} rows but y has {y.size} entries")
+        self.objective_ = "regression"
+        self.num_classes_ = 1
+        self.n_features_in_ = D
+        # Free the model left over from an earlier fit before replacing it.
+        self._release_handle()
+        self._handle = _lib.hrboost_create()
+        if not self._handle:
+            raise RuntimeError("hrboost_create() returned a NULL handle")
+        cat_list = self.cat_features if self.cat_features is not None else []
+        cats = np.asarray(cat_list, dtype=np.int32)
+        obj_bytes = self.objective_.encode("utf-8")
+        # Set environment variables for C++ logging control
+        previous = os.environ.get("HRBOOST_VERBOSE")
+        os.environ["HRBOOST_VERBOSE"] = "1" if self.verbose else "0"
+        try:
+            _lib.hrboost_fit(
+                self._handle,
+                X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                y.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                cats.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
+                ctypes.c_char_p(obj_bytes),
+                ctypes.c_double(self.learning_rate),
+                ctypes.c_double(self.reg_lambda),
+                ctypes.c_double(self.subsample),
+                ctypes.c_double(self.colsample_bytree),
+                ctypes.c_double(self.min_child_weight),
+                ctypes.c_double(self.gamma),
+                ctypes.c_double(self.max_delta_step),
+                ctypes.c_int(n),
+                ctypes.c_int(D),
+                ctypes.c_int(self.n_estimators),
+                ctypes.c_int(self.max_depth),
+                ctypes.c_int(self.max_leaves),
+                ctypes.c_int(self.n_bins),
+                ctypes.c_int(len(cats)),
+                ctypes.c_int(self.random_state),
+                ctypes.c_int(self.num_classes_),
+            )
+        finally:
+            # Put the caller's environment back exactly as it was.
+            if previous is None:
+                os.environ.pop("HRBOOST_VERBOSE", None)
+            else:
+                os.environ["HRBOOST_VERBOSE"] = previous
+        return self
+    def predict(self, X):
+        """Return one float64 prediction per row of ``X``.
+
+        Raises:
+            ValueError: ``X`` is not 2-D or its column count differs from
+                the training data.
+            RuntimeError: no native model is attached to this instance.
+        """
+        check_is_fitted(self)
+        handle = getattr(self, "_handle", None)
+        if not handle:
+            raise RuntimeError(
+                "the native hrboost model is not available (it is not "
+                "carried over by copy/pickle); call fit() again"
+            )
+        X = self._as_matrix(X)
+        n, D = X.shape
+        if D != self.n_features_in_:
+            raise ValueError(
+                f"X has {D} features, but the model was fitted with "
+                f"{self.n_features_in_}"
+            )
+        out = np.empty(n, dtype=np.float64)
+        _lib.hrboost_predict(
+            handle,
+            X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            ctypes.c_int(n),
+            ctypes.c_int(D),
+            out.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
+        )
+        return out
+    def __del__(self):
+        self._release_handle()

hrboost-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,130 @@
+Metadata-Version: 2.4
+Name: hrboost
+Version: 0.1.0
+Summary: HRBoost: Hierarchical Refined Boost - GBDT with Non-monotonic Bayesian Hierarchical Clustering
+License: MIT
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: numpy>=1.24
+Requires-Dist: scikit-learn>=1.3
+Provides-Extra: dev
+Requires-Dist: matplotlib>=3.7; extra == "dev"
+Requires-Dist: pandas>=2.0; extra == "dev"
+Requires-Dist: lightgbm; extra == "dev"
+Requires-Dist: xgboost; extra == "dev"
+Requires-Dist: catboost; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Requires-Dist: build; extra == "dev"
+# HRBoost (Hierarchical Refined Boost)
+HRBoost is a fast, lightweight Gradient Boosting Decision Tree (GBDT) library built in C++ and Python. It introduces a **Non-monotonic Bayesian Hierarchical Clustering (LNM-BHC, $k=3$)** algorithm inside its core engine to find optimal splits for high-cardinality categorical variables with zero manual parameter tuning.
+It is designed to be 100% compliant with the `scikit-learn` API, offering both `HRBoostClassifier` and `HRBoostRegressor`.
+---
+## Key Features
+- **Optimal Categorical Splitting (LNM-BHC)**: Implements non-monotonic Bayesian Hierarchical Clustering to capture categorical structure under noise without sorting artifacts.
+- **Zero-Parameter Diet**: Slimmed-down hyperparameter interface where BHC regularization uses a robust fixed sliding window size $k=3$ and falls back to `reg_lambda`.
+- **Scikit-Learn Compliant**: Drop-in replacement for `LGBMClassifier/Regressor` or `XGBClassifier/Regressor` in Python pipelines.
+- **COHESION_REG Tuning**: Keep control of dynamic regularization sensitivity via the `COHESION_REG` environment variable (default: `0.3`).
+---
+## Installation
+### From PyPI
+```bash
+pip install hrboost
+```
+### From Source
+Ensure you have a C++ compiler supporting C++17.
+```bash
+git clone https://github.com/yourusername/hrboost.git
+cd hrboost
+sh build.sh
+pip install -e .
+```
+---
+## Quick Start
+### 1. Classification (`HRBoostClassifier`)
+`HRBoostClassifier` supports binary and multiclass tasks natively.
+```python
+import numpy as np
+from sklearn.datasets import load_digits
+from sklearn.model_selection import train_test_split
+from hrboost import HRBoostClassifier
+# Load digits dataset (10 classes)
+digits = load_digits()
+X_train, X_test, y_train, y_test = train_test_split(
+    digits.data, digits.target, test_size=0.2, random_state=42
+)
+# Initialize & fit
+clf = HRBoostClassifier(
+    n_estimators=100,
+    learning_rate=0.1,
+    max_depth=4,
+    random_state=42,
+    objective="multiclass"
+)
+clf.fit(X_train, y_train)
+# Predict probabilities and classes
+probs = clf.predict_proba(X_test)
+preds = clf.predict(X_test)
+accuracy = np.mean(preds == y_test)
+print(f"Accuracy: {accuracy:.4f}")
+```
+### 2. Regression (`HRBoostRegressor`)
+`HRBoostRegressor` models continuous target values with Mean Squared Error (MSE) objective.
+```python
+from sklearn.datasets import load_diabetes
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+from hrboost import HRBoostRegressor
+# Load diabetes dataset
+diabetes = load_diabetes()
+X_train, X_test, y_train, y_test = train_test_split(
+    diabetes.data, diabetes.target, test_size=0.2, random_state=42
+)
+# Initialize & fit
+reg = HRBoostRegressor(
+    n_estimators=150,
+    learning_rate=0.08,
+    max_depth=4,
+    random_state=42
+)
+reg.fit(X_train, y_train)
+# Predict
+preds = reg.predict(X_test)
+mse = mean_squared_error(y_test, preds)
+print(f"MSE: {mse:.4f}")
+```
+### 3. Dynamic Regularization Sensitivity (`COHESION_REG`)
+You can tune BHC's dynamic regularization cohesion penalty via the environment variable:
+```bash
+export COHESION_REG=0.5
+python your_script.py
+```
+---
+## License
+This project is licensed under the MIT License.

hrboost-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+hrboost/__init__.py,sha256=gZPju5sozMKu6MSBkwREw-wDMYSnQHKeOsLUDPbnbLY,108
+hrboost/_lib.py,sha256=lYaf5cdRSX2wInSBETGIxGqEm_SVmwzK9GKue1siTak,2637
+hrboost/libhrboost.dylib,sha256=NpAUEqqZsxB2E4nf9vecIDaX4uWpvc-V5BQTG1z_Hf0,116032
+hrboost/model.py,sha256=x8rTnJtwhiHIB4PsvpTBkxXrYvO9adJITLdXeeTXAiM,8708
+hrboost-0.1.0.dist-info/METADATA,sha256=bccFjAxc_HOmgimZcs84O8_OX1SJuSlB_HBLIhLmqTQ,3771
+hrboost-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+hrboost-0.1.0.dist-info/top_level.txt,sha256=Fpt5lBH26NdndNLH02GfemBIzBKqu0f_Fo9ifpDFOGo,8
+hrboost-0.1.0.dist-info/RECORD,,

hrboost-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

hrboost-0.1.0.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+hrboost