hrboost 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hrboost/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .model import HRBoostClassifier, HRBoostRegressor
2
+
3
+ __all__ = ["HRBoostClassifier", "HRBoostRegressor"]
hrboost/_lib.py ADDED
@@ -0,0 +1,70 @@
1
+ import ctypes
2
+ import os
3
+ import pathlib
4
+
5
def _load():
    """Locate and load the native ``libhrboost`` shared library.

    Candidates are probed in a fixed order: for each file name
    (``libhrboost.dylib`` first, then ``libhrboost.so``) the package
    directory is checked, then the directory two levels above the package
    directory (used for local development builds).

    A candidate that exists but cannot be loaded -- for example a macOS
    ``.dylib`` sitting next to the package on Linux -- is skipped instead of
    aborting the search, so a later, loadable candidate is still picked up.

    Returns:
        ctypes.CDLL: handle to the first candidate that loads successfully.

    Raises:
        FileNotFoundError: if none of the candidate files exists.
        OSError: if at least one candidate exists but none could be loaded;
            the message lists every candidate together with its load error.
    """
    here = pathlib.Path(__file__).resolve().parent
    root = here.parent.parent
    failures = []

    for name in ("libhrboost.dylib", "libhrboost.so"):
        # Installed package location first, development location second.
        for candidate in (here / name, root / name):
            if not candidate.exists():
                continue
            try:
                return ctypes.CDLL(str(candidate))
            except OSError as exc:
                # Wrong platform/architecture or a corrupt file: remember
                # why it failed and keep looking at the other candidates.
                failures.append(f"{candidate}: {exc}")

    if failures:
        raise OSError(
            "libhrboost was found but could not be loaded:\n  "
            + "\n  ".join(failures)
        )
    raise FileNotFoundError(
        "libhrboost not found — run `make` in the project root"
    )
20
+
21
# Load the native library once at import time. Everything below only declares
# the foreign-function signatures so that ctypes converts arguments and
# return values with the right C types.
_lib = _load()

# hrboost_create: no arguments, returns a pointer-sized opaque model handle.
_lib.hrboost_create.argtypes = []
_lib.hrboost_create.restype = ctypes.c_void_p

# hrboost_free: takes a model handle, returns nothing.
_lib.hrboost_free.argtypes = [ctypes.c_void_p]
_lib.hrboost_free.restype = None

# hrboost_fit: 21 positional arguments, returns nothing.
_lib.hrboost_fit.argtypes = (
    [
        ctypes.c_void_p,                  # 1. model handle
        ctypes.POINTER(ctypes.c_float),   # 2. X
        ctypes.POINTER(ctypes.c_float),   # 3. y (float target)
        ctypes.POINTER(ctypes.c_int),     # 4. cat_features ptr
        ctypes.c_char_p,                  # 5. objective
    ]
    # 6-12 (double): learning_rate, reg_lambda, subsample, colsample_bytree,
    #                min_child_weight, gamma, max_delta_step
    + [ctypes.c_double] * 7
    # 13-21 (int): n, D, n_estimators, max_depth, max_leaves, n_bins,
    #              cat_features_len, random_state, num_classes
    + [ctypes.c_int] * 9
)
_lib.hrboost_fit.restype = None

# hrboost_predict_proba: (handle, X, n, D, out_p), returns nothing; results
# are written through the out_p double pointer.
_lib.hrboost_predict_proba.argtypes = [
    ctypes.c_void_p,
    ctypes.POINTER(ctypes.c_float),
    ctypes.c_int,
    ctypes.c_int,
    ctypes.POINTER(ctypes.c_double),
]
_lib.hrboost_predict_proba.restype = None

# hrboost_predict: (handle, X, n, D, out_y), returns nothing; results are
# written through the out_y double pointer.
_lib.hrboost_predict.argtypes = [
    ctypes.c_void_p,
    ctypes.POINTER(ctypes.c_float),
    ctypes.c_int,
    ctypes.c_int,
    ctypes.POINTER(ctypes.c_double),
]
_lib.hrboost_predict.restype = None
Binary file
hrboost/model.py ADDED
@@ -0,0 +1,250 @@
1
+ import os
2
+ import ctypes
3
+ import numpy as np
4
+ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
5
+ from sklearn.utils.validation import check_is_fitted
6
+ from ._lib import _lib
7
+
8
class HRBoostClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-learn style classifier backed by the native ``libhrboost`` library.

    All constructor arguments are stored unchanged and forwarded to the
    native ``hrboost_fit`` call when :meth:`fit` runs; their exact semantics
    are defined by the native library.

    Attributes set by :meth:`fit`:
        classes_: sorted unique labels seen in ``y``.
        objective_: ``"binary"`` or ``"multiclass"`` (the objective actually
            sent to the native library).
        num_classes_: class count sent to the native library (``1`` for the
            binary objective).
        n_features_in_: number of columns of the training matrix.

    NOTE(review): the fitted model lives behind a raw native handle
    (``_handle``) that is freed in ``__del__``. Copying or pickling a fitted
    estimator duplicates that handle value without duplicating the native
    model -- confirm whether this needs explicit support.
    """

    def __init__(
        self,
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        max_leaves=64,
        reg_lambda=1.0,
        subsample=0.8,
        colsample_bytree=1.0,
        n_bins=32,
        min_child_weight=0.1,
        gamma=0.0,
        max_delta_step=0.0,
        cat_features=None,
        random_state=0,
        objective="binary",
        num_classes=None,
        verbose=True,
    ):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.max_leaves = max_leaves
        self.reg_lambda = reg_lambda
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_bins = n_bins
        self.min_child_weight = min_child_weight
        self.gamma = gamma
        self.max_delta_step = max_delta_step
        self.cat_features = cat_features
        self.random_state = random_state
        self.objective = objective
        self.num_classes = num_classes
        self.verbose = verbose

    def _release_handle(self):
        """Free the native model handle, if any, exactly once."""
        handle = getattr(self, "_handle", None)
        if handle:
            # Clear first so a failure inside the native free cannot lead to
            # a second free of the same pointer later on.
            self._handle = None
            _lib.hrboost_free(handle)

    def _as_feature_matrix(self, X):
        """Return ``X`` as a C-contiguous float32 matrix matching the fit-time width.

        Raises:
            RuntimeError: if the native model handle is missing.
            ValueError: if ``X`` is not 2-D or its column count differs from
                ``n_features_in_`` (the native code reads ``n * D`` floats
                through a raw pointer, so a mismatch would read out of bounds).
        """
        if not getattr(self, "_handle", None):
            raise RuntimeError(
                "native model handle is missing; call fit() before predicting"
            )
        X = np.ascontiguousarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but this estimator was fitted "
                f"with {self.n_features_in_} features"
            )
        return X

    def fit(self, X, y):
        """Train the native model on ``X`` and ``y``.

        Labels are encoded as ``0..K-1`` (their position in the sorted
        ``classes_``) before being handed to the native library, so arbitrary
        label values are supported; :meth:`predict` maps back to ``classes_``.

        Args:
            X: 2-D array-like of features, converted to float32.
            y: 1-D array-like of class labels with one entry per row of ``X``.

        Returns:
            self

        Raises:
            ValueError: on empty input, shape mismatches, a ``num_classes``
                smaller than the number of observed labels, or
                ``cat_features`` indices outside ``[0, n_features)``.
            MemoryError: if the native library fails to allocate a model.
        """
        X = np.ascontiguousarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        n, D = X.shape
        if n == 0:
            raise ValueError("X must contain at least one sample")

        labels = np.asarray(y).ravel()
        if labels.shape[0] != n:
            raise ValueError(
                f"X and y have inconsistent lengths: {n} != {labels.shape[0]}"
            )

        # codes[i] is the index of labels[i] inside the sorted classes_.
        self.classes_, codes = np.unique(labels, return_inverse=True)
        n_classes = len(self.classes_)

        # NOTE(review): objective="binary" (the default) keeps the binary
        # objective even when more than two labels are present; this mirrors
        # the original selection rule -- confirm that this is intended.
        if self.objective == "binary" or n_classes <= 2:
            self.objective_ = "binary"
            self.num_classes_ = 1
        else:
            self.objective_ = "multiclass"
            if self.num_classes is None:
                self.num_classes_ = n_classes
            elif self.num_classes < n_classes:
                raise ValueError(
                    f"num_classes={self.num_classes} is smaller than the "
                    f"{n_classes} distinct labels found in y"
                )
            else:
                self.num_classes_ = self.num_classes

        # Target must be float32 for C++ HRBoost fit.
        y_enc = np.ascontiguousarray(codes, dtype=np.float32)

        cat_list = [] if self.cat_features is None else self.cat_features
        cats = np.ascontiguousarray(cat_list, dtype=np.int32).ravel()
        if cats.size and (cats.min() < 0 or cats.max() >= D):
            raise ValueError(
                f"cat_features must be column indices in [0, {D}), got "
                f"{cats.tolist()}"
            )

        self.n_features_in_ = D

        # Re-fitting must not leak the model created by a previous fit().
        self._release_handle()
        handle = _lib.hrboost_create()
        if not handle:
            raise MemoryError("hrboost_create returned a NULL handle")
        self._handle = handle

        obj_bytes = self.objective_.encode("utf-8")

        # Set environment variables for C++ logging control; the previous
        # value is restored afterwards so the process environment is not
        # permanently modified.
        old_verbose = os.environ.get("HRBOOST_VERBOSE")
        os.environ["HRBOOST_VERBOSE"] = "1" if self.verbose else "0"
        try:
            _lib.hrboost_fit(
                self._handle,
                X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                y_enc.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                cats.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
                ctypes.c_char_p(obj_bytes),
                ctypes.c_double(self.learning_rate),
                ctypes.c_double(self.reg_lambda),
                ctypes.c_double(self.subsample),
                ctypes.c_double(self.colsample_bytree),
                ctypes.c_double(self.min_child_weight),
                ctypes.c_double(self.gamma),
                ctypes.c_double(self.max_delta_step),
                ctypes.c_int(n),
                ctypes.c_int(D),
                ctypes.c_int(self.n_estimators),
                ctypes.c_int(self.max_depth),
                ctypes.c_int(self.max_leaves),
                ctypes.c_int(self.n_bins),
                ctypes.c_int(cats.size),
                ctypes.c_int(self.random_state),
                ctypes.c_int(self.num_classes_),
            )
        finally:
            if old_verbose is None:
                os.environ.pop("HRBOOST_VERBOSE", None)
            else:
                os.environ["HRBOOST_VERBOSE"] = old_verbose

        return self

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Returns:
            ndarray of float64: shape ``(n, 2)`` for the binary objective
            (columns are ``1 - p`` and ``p``), otherwise
            ``(n, num_classes_)``.
        """
        check_is_fitted(self)
        X = self._as_feature_matrix(X)
        n, D = X.shape

        is_binary = self.objective_ == "binary"
        # The native call writes one value per row for the binary objective
        # and num_classes_ values per row otherwise.
        width = 1 if is_binary else self.num_classes_
        out = np.empty(n * width, dtype=np.float64)
        _lib.hrboost_predict_proba(
            self._handle,
            X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.c_int(n),
            ctypes.c_int(D),
            out.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        )
        if is_binary:
            return np.column_stack([1.0 - out, out])
        return out.reshape(n, self.num_classes_)

    def predict(self, X):
        """Return the predicted label (an element of ``classes_``) for each row of ``X``."""
        check_is_fitted(self)
        proba = self.predict_proba(X)
        if self.objective_ == "binary":
            idx = (proba[:, 1] >= 0.5).astype(int)
        else:
            idx = np.argmax(proba, axis=1)
        # The probability matrix can have more columns than there are
        # observed labels (single-label training data, or an explicit
        # num_classes larger than the label count); clamp so the lookup in
        # classes_ stays in range.
        idx = np.minimum(idx, len(self.classes_) - 1)
        return self.classes_[idx]

    def __del__(self):
        self._release_handle()
145
+
146
+
147
class HRBoostRegressor(BaseEstimator, RegressorMixin):
    """Scikit-learn style regressor backed by the native ``libhrboost`` library.

    All constructor arguments are stored unchanged and forwarded to the
    native ``hrboost_fit`` call when :meth:`fit` runs; their exact semantics
    are defined by the native library.

    Attributes set by :meth:`fit`:
        objective_: always ``"regression"``.
        num_classes_: always ``1``.
        n_features_in_: number of columns of the training matrix.

    NOTE(review): the fitted model lives behind a raw native handle
    (``_handle``) that is freed in ``__del__``. Copying or pickling a fitted
    estimator duplicates that handle value without duplicating the native
    model -- confirm whether this needs explicit support.
    """

    def __init__(
        self,
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4,
        max_leaves=64,
        reg_lambda=1.0,
        subsample=0.8,
        colsample_bytree=1.0,
        n_bins=32,
        min_child_weight=0.1,
        gamma=0.0,
        max_delta_step=0.0,
        cat_features=None,
        random_state=0,
        verbose=True,
    ):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.max_leaves = max_leaves
        self.reg_lambda = reg_lambda
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.n_bins = n_bins
        self.min_child_weight = min_child_weight
        self.gamma = gamma
        self.max_delta_step = max_delta_step
        self.cat_features = cat_features
        self.random_state = random_state
        self.verbose = verbose

    def _release_handle(self):
        """Free the native model handle, if any, exactly once."""
        handle = getattr(self, "_handle", None)
        if handle:
            # Clear first so a failure inside the native free cannot lead to
            # a second free of the same pointer later on.
            self._handle = None
            _lib.hrboost_free(handle)

    def fit(self, X, y):
        """Train the native model on ``X`` and ``y`` with the ``"regression"`` objective.

        Args:
            X: 2-D array-like of features, converted to float32.
            y: array-like of targets with one value per row of ``X``,
                converted to float32.

        Returns:
            self

        Raises:
            ValueError: on empty input, shape mismatches, or
                ``cat_features`` indices outside ``[0, n_features)``.
            MemoryError: if the native library fails to allocate a model.
        """
        X = np.ascontiguousarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        n, D = X.shape
        if n == 0:
            raise ValueError("X must contain at least one sample")

        # The native code reads n floats through a raw pointer, so the
        # target length has to be checked here.
        y = np.ascontiguousarray(y, dtype=np.float32).ravel()
        if y.shape[0] != n:
            raise ValueError(
                f"X and y have inconsistent lengths: {n} != {y.shape[0]}"
            )

        cat_list = [] if self.cat_features is None else self.cat_features
        cats = np.ascontiguousarray(cat_list, dtype=np.int32).ravel()
        if cats.size and (cats.min() < 0 or cats.max() >= D):
            raise ValueError(
                f"cat_features must be column indices in [0, {D}), got "
                f"{cats.tolist()}"
            )

        self.objective_ = "regression"
        self.num_classes_ = 1
        self.n_features_in_ = D

        # Re-fitting must not leak the model created by a previous fit().
        self._release_handle()
        handle = _lib.hrboost_create()
        if not handle:
            raise MemoryError("hrboost_create returned a NULL handle")
        self._handle = handle

        obj_bytes = self.objective_.encode("utf-8")

        # Set environment variables for C++ logging control; the previous
        # value is restored afterwards so the process environment is not
        # permanently modified.
        old_verbose = os.environ.get("HRBOOST_VERBOSE")
        os.environ["HRBOOST_VERBOSE"] = "1" if self.verbose else "0"
        try:
            _lib.hrboost_fit(
                self._handle,
                X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                y.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
                cats.ctypes.data_as(ctypes.POINTER(ctypes.c_int)),
                ctypes.c_char_p(obj_bytes),
                ctypes.c_double(self.learning_rate),
                ctypes.c_double(self.reg_lambda),
                ctypes.c_double(self.subsample),
                ctypes.c_double(self.colsample_bytree),
                ctypes.c_double(self.min_child_weight),
                ctypes.c_double(self.gamma),
                ctypes.c_double(self.max_delta_step),
                ctypes.c_int(n),
                ctypes.c_int(D),
                ctypes.c_int(self.n_estimators),
                ctypes.c_int(self.max_depth),
                ctypes.c_int(self.max_leaves),
                ctypes.c_int(self.n_bins),
                ctypes.c_int(cats.size),
                ctypes.c_int(self.random_state),
                ctypes.c_int(self.num_classes_),
            )
        finally:
            if old_verbose is None:
                os.environ.pop("HRBOOST_VERBOSE", None)
            else:
                os.environ["HRBOOST_VERBOSE"] = old_verbose

        return self

    def predict(self, X):
        """Return one float64 prediction per row of ``X``.

        Raises:
            RuntimeError: if the native model handle is missing.
            ValueError: if ``X`` is not 2-D or its column count differs from
                ``n_features_in_`` (the native code reads ``n * D`` floats
                through a raw pointer, so a mismatch would read out of bounds).
        """
        check_is_fitted(self)
        if not getattr(self, "_handle", None):
            raise RuntimeError(
                "native model handle is missing; call fit() before predicting"
            )
        X = np.ascontiguousarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        n, D = X.shape
        if D != self.n_features_in_:
            raise ValueError(
                f"X has {D} features, but this estimator was fitted "
                f"with {self.n_features_in_} features"
            )

        out = np.empty(n, dtype=np.float64)
        _lib.hrboost_predict(
            self._handle,
            X.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.c_int(n),
            ctypes.c_int(D),
            out.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
        )
        return out

    def __del__(self):
        self._release_handle()
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: hrboost
3
+ Version: 0.1.0
4
+ Summary: HRBoost: Hierarchical Refined Boost - GBDT with Non-monotonic Bayesian Hierarchical Clustering
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy>=1.24
9
+ Requires-Dist: scikit-learn>=1.3
10
+ Provides-Extra: dev
11
+ Requires-Dist: matplotlib>=3.7; extra == "dev"
12
+ Requires-Dist: pandas>=2.0; extra == "dev"
13
+ Requires-Dist: lightgbm; extra == "dev"
14
+ Requires-Dist: xgboost; extra == "dev"
15
+ Requires-Dist: catboost; extra == "dev"
16
+ Requires-Dist: twine; extra == "dev"
17
+ Requires-Dist: build; extra == "dev"
18
+
19
+ # HRBoost (Hierarchical Refined Boost)
20
+
21
+ HRBoost is a fast, lightweight Gradient Boosting Decision Tree (GBDT) library built in C++ and Python. It introduces a **Non-monotonic Bayesian Hierarchical Clustering (LNM-BHC, $k=3$)** algorithm inside its core engine to find optimal splits for high-cardinality categorical variables with zero manual parameter tuning.
22
+
23
+ It is designed to be 100% compliant with the `scikit-learn` API, offering both `HRBoostClassifier` and `HRBoostRegressor`.
24
+
25
+ ---
26
+
27
+ ## Key Features
28
+
29
+ - **Optimal Categorical Splitting (LNM-BHC)**: Implements non-monotonic Bayesian Hierarchical Clustering to capture categorical structure under noise without sorting artifacts.
30
+ - **Zero-Parameter Diet**: Slimmed-down hyperparameter interface where BHC regularization uses a robust fixed sliding window size $k=3$ and falls back to `reg_lambda`.
31
+ - **Scikit-Learn Compliant**: Drop-in replacement for `LGBMClassifier/Regressor` or `XGBClassifier/Regressor` in Python pipelines.
32
+ - **COHESION_REG Tuning**: Keep control of dynamic regularization sensitivity via the `COHESION_REG` environment variable (default: `0.3`).
33
+
34
+ ---
35
+
36
+ ## Installation
37
+
38
+ ### From PyPI
39
+ ```bash
40
+ pip install hrboost
41
+ ```
42
+
43
+ ### From Source
44
+ Ensure you have a C++ compiler supporting C++17.
45
+ ```bash
46
+ git clone https://github.com/yourusername/hrboost.git
47
+ cd hrboost
48
+ sh build.sh
49
+ pip install -e .
50
+ ```
51
+
52
+ ---
53
+
54
+ ## Quick Start
55
+
56
+ ### 1. Classification (`HRBoostClassifier`)
57
+ `HRBoostClassifier` supports binary and multiclass tasks natively.
58
+
59
+ ```python
60
+ import numpy as np
61
+ from sklearn.datasets import load_digits
62
+ from sklearn.model_selection import train_test_split
63
+ from hrboost import HRBoostClassifier
64
+
65
+ # Load digits dataset (10 classes)
66
+ digits = load_digits()
67
+ X_train, X_test, y_train, y_test = train_test_split(
68
+ digits.data, digits.target, test_size=0.2, random_state=42
69
+ )
70
+
71
+ # Initialize & fit
72
+ clf = HRBoostClassifier(
73
+ n_estimators=100,
74
+ learning_rate=0.1,
75
+ max_depth=4,
76
+ random_state=42,
77
+ objective="multiclass"
78
+ )
79
+ clf.fit(X_train, y_train)
80
+
81
+ # Predict probabilities and classes
82
+ probs = clf.predict_proba(X_test)
83
+ preds = clf.predict(X_test)
84
+
85
+ accuracy = np.mean(preds == y_test)
86
+ print(f"Accuracy: {accuracy:.4f}")
87
+ ```
88
+
89
+ ### 2. Regression (`HRBoostRegressor`)
90
+ `HRBoostRegressor` models continuous target values with Mean Squared Error (MSE) objective.
91
+
92
+ ```python
93
+ from sklearn.datasets import load_diabetes
94
+ from sklearn.metrics import mean_squared_error
95
+ from hrboost import HRBoostRegressor
96
+
97
+ # Load diabetes dataset (reuses `train_test_split` imported in the classification example above)
98
+ diabetes = load_diabetes()
99
+ X_train, X_test, y_train, y_test = train_test_split(
100
+ diabetes.data, diabetes.target, test_size=0.2, random_state=42
101
+ )
102
+
103
+ # Initialize & fit
104
+ reg = HRBoostRegressor(
105
+ n_estimators=150,
106
+ learning_rate=0.08,
107
+ max_depth=4,
108
+ random_state=42
109
+ )
110
+ reg.fit(X_train, y_train)
111
+
112
+ # Predict
113
+ preds = reg.predict(X_test)
114
+ mse = mean_squared_error(y_test, preds)
115
+ print(f"MSE: {mse:.4f}")
116
+ ```
117
+
118
+ ### 3. Dynamic Regularization Sensitivity (`COHESION_REG`)
119
+ You can tune BHC's dynamic-regularization cohesion penalty via the `COHESION_REG` environment variable:
120
+
121
+ ```bash
122
+ export COHESION_REG=0.5
123
+ python your_script.py
124
+ ```
125
+
126
+ ---
127
+
128
+ ## License
129
+
130
+ This project is licensed under the MIT License.
@@ -0,0 +1,8 @@
1
+ hrboost/__init__.py,sha256=gZPju5sozMKu6MSBkwREw-wDMYSnQHKeOsLUDPbnbLY,108
2
+ hrboost/_lib.py,sha256=lYaf5cdRSX2wInSBETGIxGqEm_SVmwzK9GKue1siTak,2637
3
+ hrboost/libhrboost.dylib,sha256=NpAUEqqZsxB2E4nf9vecIDaX4uWpvc-V5BQTG1z_Hf0,116032
4
+ hrboost/model.py,sha256=x8rTnJtwhiHIB4PsvpTBkxXrYvO9adJITLdXeeTXAiM,8708
5
+ hrboost-0.1.0.dist-info/METADATA,sha256=bccFjAxc_HOmgimZcs84O8_OX1SJuSlB_HBLIhLmqTQ,3771
6
+ hrboost-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ hrboost-0.1.0.dist-info/top_level.txt,sha256=Fpt5lBH26NdndNLH02GfemBIzBKqu0f_Fo9ifpDFOGo,8
8
+ hrboost-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ hrboost