oqboost 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oqboost/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ OQBoost — Oblique Gradient-Boosted Decision Trees.
3
+
4
+ Gradient-boosted oblique decision trees where split directions are optimized
5
+ by a C++ BFS engine with zero GPU-CPU sync overhead during training.
6
+
7
+ Quickstart
8
+ ----------
9
+ >>> from oqboost import OQBoostClassifier
10
+ >>> clf = OQBoostClassifier(n_estimators=500, max_depth=6)
11
+ >>> clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
12
+ >>> clf.predict_proba(X_test)
13
+ """
14
+
15
+ from ._classifier import OQBoostClassifier
16
+ from ._regressor import OQBoostRegressor
17
+ from ._oqboost import OQBoostTree
18
+
19
+
20
def load_model(path: str) -> "OQBoostClassifier | OQBoostRegressor":
    """Load a model saved with ``clf.save(path)``.

    Parameters
    ----------
    path : str
        Location of a file previously written by an estimator's ``save``
        method.

    Returns
    -------
    OQBoostClassifier or OQBoostRegressor
        Whatever object was pickled at ``path``. The type is not checked.

    Notes
    -----
    SECURITY: ``joblib.load`` unpickles the file, and unpickling can execute
    arbitrary code. Only load files that come from a trusted source.
    """
    # The return annotation is quoted on purpose: this module does not use
    # ``from __future__ import annotations``, and evaluating ``A | B`` on two
    # classes at definition time raises TypeError before Python 3.10.

    # Imported lazily so that importing the package does not import joblib.
    import joblib

    # joblib.load retrieves the actual pickled estimator type
    return joblib.load(path)
25
+
26
+
27
+ __version__ = "0.1.3"
28
+ __all__ = [
29
+ "OQBoostClassifier",
30
+ "OQBoostRegressor",
31
+ "OQBoostTree",
32
+ "load_model",
33
+ ]
oqboost/_classifier.py ADDED
@@ -0,0 +1,499 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ from sklearn.base import BaseEstimator, ClassifierMixin
5
+ from sklearn.utils.validation import check_is_fitted
6
+
7
+
8
+ class OQBoostClassifier(BaseEstimator, ClassifierMixin):
9
+ """
10
+ OQBoost: gradient-boosted oblique decision trees.
11
+
12
+ Split directions are found by a C++ engine running a candidate tournament
13
+ per node (axis scan, inherited A/B/C mutations, pobs_sis orthogonal
14
+ blocks, direction cache) with lazy best-first tree growth and a
15
+ depth-adaptive candidate budget.
16
+
17
+ Two effective hyperparameters per tree (max_depth, reg_lambda).
18
+ Native missing-value handling (numeric NaN → mean imputation baked into
19
+ the binning context) and native categorical handling (per-round
20
+ gradient-rank target encoding; categories participate in both axis
21
+ scans and oblique projections). Columns named in ``cat_features`` are
22
+ internally moved after the numeric block; category values must be
23
+ numeric IDs (floats holding integers). NaN is allowed anywhere.
24
+
25
+ Parameters
26
+ ----------
27
+ n_estimators : int
28
+ Number of boosting rounds.
29
+ learning_rate : float
30
+ Shrinkage applied to each tree's leaf values.
31
+ max_depth : int
32
+ Maximum tree depth; leaf budget is 2^max_depth (depth=6 → 64 leaves,
33
+ matching XGBoost/CatBoost defaults).
34
+ reg_lambda : float
35
+ L2 regularisation on leaf weights (Newton step denominator).
36
+ subsample : float
37
+ Fraction of training samples used to build each tree.
38
+ early_stopping_rounds : int or None
39
+ Stop if validation loss does not improve for this many rounds.
40
+ random_state : int or None
41
+ Seed for reproducibility.
42
+ verbose : bool
43
+ Print per-round metrics during training.
44
+ cat_features : list of str or int, optional
45
+ Column names (if X is a DataFrame) or column indices treated as
46
+ categorical. These features are excluded from numerical projections.
47
+ class_weight : str or None
48
+ "balanced" applies a prior-corrected argmax decision rule in
49
+ ``predict`` (training itself is always unweighted, so
50
+ ``predict_proba`` stays calibrated). Improves balanced accuracy
51
+ under class imbalance with no log-loss cost. Pass None for the
52
+ plain argmax rule.
53
+ prior_alpha : float
54
+ Strength of the prior correction used when ``class_weight ==
55
+ "balanced"``: predictions are ``argmax P / prior**alpha``.
56
+ 0 = plain MAP rule (max raw accuracy), 1 = full correction
57
+ (max balanced accuracy). The default 0.5 is the geometric-mean
58
+ compromise and typically maximises macro-F1.
59
+ inherited_rp_ratio : float
60
+ Fraction of split-direction candidates generated by parent weight
61
+ inheritance (vs fresh gradient-guided random projections).
62
+ mutation_rate : float
63
+ Base noise scale for axis-maintaining mutation (Strategy A);
64
+ decays with node depth as rate/sqrt(1+depth).
65
+ mutation_strength : float
66
+ Base weight for new-axis borrowing (Strategy B);
67
+ decays with node depth as strength/(1+depth).
68
+ pobs : bool
69
+ Inject 8 pobs_sis candidates (SIS-weighted support, exact
70
+ Haar-orthogonal block) into every node's tournament, carved from
71
+ the inherited budget. Validated to improve logloss/AUC on real
72
+ benchmarks. Disabled by default (``pobs=False``), which keeps the pure A/B/C pool.
73
+ """
74
+
75
def __init__(
    self,
    n_estimators: int = 1000,
    learning_rate: float = 0.03,
    max_depth: int = 6,
    reg_lambda: float = 1.0,
    subsample: float = 0.8,
    early_stopping_rounds: int | None = 50,
    random_state: int | None = None,
    verbose: bool = False,
    cat_features: list | None = None,
    class_weight: str | None = None,
    prior_alpha: float = 0.5,
    inherited_rp_ratio: float = 1.0,
    mutation_rate: float = 0.1,
    mutation_strength: float = 0.5,
    pobs: bool = False,
    goss: bool = False,
    goss_top_rate: float = 0.2,
    goss_other_rate: float = 0.1,
    reg_alpha: float = 0.0,
    gamma: float = 0.0,
    min_child_weight: float = 1.0,
    max_leaves: int | None = None,
    max_bin: int = 255,
    colsample_bynode: float = 1.0,
    multi_strategy: str = "shared",
):
    """Store the hyperparameters verbatim; no validation happens here.

    ``n_estimators`` through ``pobs`` are described in the class docstring.
    The remaining parameters are:

    goss : bool
        When True, the boosting loop selects rows by gradient-based
        one-side sampling instead of ``subsample``.
    goss_top_rate : float
        Fraction of rows with the largest mean absolute gradient that GOSS
        always keeps.
    goss_other_rate : float
        Fraction of the remaining rows that GOSS samples at random.
    reg_alpha, gamma, min_child_weight, colsample_bynode : float
        Forwarded unchanged to the tree builder.
        NOTE(review): presumably L1 penalty, minimum split gain, minimum
        child hessian and per-node column sampling, as in XGBoost;
        confirm against the C++ engine.
    max_leaves : int or None
        Leaf cap forwarded to the tree builder; ``None`` means
        ``2 ** max_depth``.
    max_bin : int
        Forwarded to the binning context built once per fit.
    multi_strategy : str
        ``"ovr"`` trains one binary model per class when there are three
        or more classes; any other value trains a single joint model.
    """
    # Only constructor parameters are stored here. Fitted attributes
    # (trailing underscore, e.g. ``feature_names_in_``) are created by
    # ``fit`` so that scikit-learn's fitted-state detection is not fooled
    # by an unfitted estimator.
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.max_depth = max_depth
    self.reg_lambda = reg_lambda
    self.subsample = subsample
    self.early_stopping_rounds = early_stopping_rounds
    self.random_state = random_state
    self.verbose = verbose
    self.cat_features = cat_features
    self.class_weight = class_weight
    self.prior_alpha = prior_alpha
    self.inherited_rp_ratio = inherited_rp_ratio
    self.mutation_rate = mutation_rate
    self.mutation_strength = mutation_strength
    self.pobs = pobs
    self.goss = goss
    self.goss_top_rate = goss_top_rate
    self.goss_other_rate = goss_other_rate
    self.reg_alpha = reg_alpha
    self.gamma = gamma
    self.min_child_weight = min_child_weight
    self.max_leaves = max_leaves
    self.max_bin = max_bin
    self.colsample_bynode = colsample_bynode
    self.multi_strategy = multi_strategy
129
+
130
+ # ── public fit/predict ────────────────────────────────────────────────────
131
+
132
def _prepare_data(self, X, is_fit=False):
    """Convert ``X`` to a float32 NumPy matrix, encoding categorical columns.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Non-DataFrame input is converted with ``np.asarray`` and returned
        as is (its ``.values`` attribute is used when present).
    is_fit : bool
        When True, column names and per-column category lists are recorded
        on the estimator (``feature_names_in_``, ``_cat_mappings_``).
        When False, the recorded category lists are reused so that codes
        match the ones seen during fitting.

    Returns
    -------
    numpy.ndarray
        float32 array. Categorical columns hold integer codes stored as
        floats; missing or unseen categories become NaN.
    """
    try:
        import pandas as pd
    except ImportError:
        # pandas is only needed to recognise DataFrame input; plain arrays
        # must keep working when it is not installed.
        pd = None

    if pd is None or not isinstance(X, pd.DataFrame):
        if hasattr(X, "values"):
            X = X.values
        return np.asarray(X, dtype=np.float32)

    frame = X.copy()
    columns = X.columns

    if is_fit:
        self.feature_names_in_ = list(columns)
        self._cat_mappings_ = {}

    requested = []
    if self.cat_features:
        for cf in self.cat_features:
            if isinstance(cf, (int, np.integer)):
                # Integers are positions; out-of-range ones are skipped.
                if 0 <= cf < len(columns):
                    requested.append(columns[cf])
            elif cf in columns:
                requested.append(cf)
    else:
        # Nothing declared: treat categorical/object columns as categories.
        for col in columns:
            dtype = X[col].dtype
            if isinstance(dtype, pd.CategoricalDtype) or dtype == object:
                requested.append(col)

    # A column may be named twice (e.g. once by position and once by label).
    # Encoding it twice would overwrite the stored categories with the float
    # codes of the first pass, so keep the first occurrence only.
    cat_cols = list(dict.fromkeys(requested))

    if is_fit and not self.cat_features and cat_cols:
        # NOTE(review): this overwrites a constructor parameter during fit,
        # so auto-detected columns persist across refits and clones.
        self.cat_features = cat_cols

    for col in cat_cols:
        if not isinstance(frame[col].dtype, pd.CategoricalDtype):
            frame[col] = frame[col].astype('category')

        if is_fit:
            self._cat_mappings_[col] = list(frame[col].cat.categories)
        elif hasattr(self, "_cat_mappings_") and col in self._cat_mappings_:
            # Re-code against the training categories; unseen values get
            # code -1 and are turned into NaN below.
            frame[col] = pd.Categorical(
                frame[col], categories=self._cat_mappings_[col]
            )

        codes = frame[col].cat.codes.astype(np.float32)
        codes[codes == -1] = np.nan
        frame[col] = codes

    return frame.values.astype(np.float32)
178
+
179
def fit(
    self,
    X,
    y,
    eval_set: list[tuple] | None = None,
    sample_weight=None,
) -> OQBoostClassifier:
    """
    Fit the classifier.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
        Non-negative integer class labels (cast to int64).
    eval_set : list of (X_val, y_val) tuples, optional
        First tuple is used for early stopping and validation metrics;
        any further tuples are ignored.
    sample_weight : array-like of shape (n_samples,), optional
        Individual weights for each sample.

    Returns
    -------
    OQBoostClassifier
        ``self``.

    Raises
    ------
    ValueError
        If the inputs are empty, have inconsistent shapes, or ``y``
        contains negative labels.
    """
    X = self._prepare_data(X, is_fit=True)
    y = np.asarray(y, dtype=np.int64)

    # Fail early with a readable message instead of an indexing error deep
    # inside the boosting loop.
    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
        raise ValueError("y must be a 1D array with one label per row of X.")
    if y.size == 0:
        raise ValueError("Cannot fit on an empty dataset.")
    if y.min() < 0:
        raise ValueError("Class labels must be non-negative integers.")

    self.n_features_in_ = X.shape[1]
    self.classes_ = np.unique(y)

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight, dtype=np.float32)
        if sample_weight.ndim != 1 or len(sample_weight) != X.shape[0]:
            raise ValueError("sample_weight must be a 1D array of length N.")

    D_num = self._resolve_D_num(X.shape[1])

    X_val, y_val = None, None
    if eval_set:
        X_val, y_val = eval_set[0]
        X_val = self._prepare_data(X_val, is_fit=False)
        y_val = np.asarray(y_val, dtype=np.int64)
        if X_val.ndim != 2 or X_val.shape[1] != X.shape[1]:
            raise ValueError(
                "eval_set features must have the same number of columns as X."
            )
        if y_val.ndim != 1 or y_val.shape[0] != X_val.shape[0]:
            raise ValueError(
                "eval_set labels must be a 1D array with one label per row."
            )

    # Drop sub-models left over from an earlier one-vs-rest fit; otherwise
    # predict_proba would keep using them after a refit that trains a single
    # joint model. The one-vs-rest path repopulates the list itself.
    self.ovr_ensembles_ = []

    self._fit_core(X, y, X_val, y_val, D_num, sample_weight=sample_weight)
    return self
219
+
220
def predict(self, X) -> np.ndarray:
    """Return the predicted class index for every row of ``X``.

    The result is the column index of the winning probability, which equals
    the integer label because labels are used directly as column indices
    during fitting.

    With ``class_weight == "balanced"`` the decision is the argmax of the
    prior-corrected probabilities ``P / prior**alpha``. Training stays
    unweighted (calibrated probas); only the decision threshold shifts.
    alpha=1 maximises balanced accuracy, alpha=0 is the plain MAP rule; the
    default alpha=0.5 (geometric-mean compromise) keeps most of the
    balanced-accuracy gain while staying within ~1.5 pp of the best raw
    accuracy and typically maximises macro-F1.
    """
    check_is_fitted(self, "trees_")
    P = self.predict_proba(X)

    alpha = getattr(self, "prior_alpha", 0.5)
    prior = getattr(self, "_prior_", None)
    balanced = getattr(self, "class_weight", None) == "balanced"

    if balanced and prior is not None and alpha > 0.0:
        prior = np.asarray(prior, dtype=np.float32)[None, :]
        # A label value that never occurred in training has prior 0. Dividing
        # by it would give that class an infinite score and make it win every
        # row, so such classes are excluded from the corrected argmax.
        seen = prior > 0.0
        safe_prior = np.where(seen, prior, 1.0)
        P = np.where(seen, P / np.power(safe_prior, alpha), 0.0)

    return P.argmax(axis=1)
237
+
238
def predict_proba(self, X) -> np.ndarray:
    """Return class probabilities of shape ``(n_samples, n_classes)``.

    For a one-vs-rest model each binary sub-ensemble contributes the logit
    ``F[:, 1] - F[:, 0]`` and a softmax across classes yields the output.
    Otherwise the softmax is taken over the scores of the joint ensemble.
    """
    ovr = getattr(self, "ovr_ensembles_", None)
    if not ovr:
        check_is_fitted(self, "trees_")

    from ._oqboost import predict_ensemble

    # Encode the input once with this estimator's own category mappings.
    # The one-vs-rest sub-classifiers were fitted on the already-encoded
    # array and hold no mappings, so letting each of them re-encode a
    # DataFrame would assign codes that differ from the training codes.
    X = np.ascontiguousarray(
        self._prepare_data(X, is_fit=False), dtype=np.float32
    )

    if ovr:
        logits = np.zeros((X.shape[0], len(ovr)), dtype=np.float32)
        for c, clf_c in enumerate(ovr):
            perm = getattr(clf_c, "_col_perm_", None)
            # Each sub-model may have moved its categorical columns last.
            X_c = X if perm is None else np.ascontiguousarray(X[:, perm])
            F = predict_ensemble(
                clf_c.trees_, X_c, 2, clf_c.learning_rate,
                np.array(clf_c.F_init_, dtype=np.float32)
            )
            logits[:, c] = F[:, 1] - F[:, 0]
    else:
        perm = getattr(self, "_col_perm_", None)
        if perm is not None:
            X = np.ascontiguousarray(X[:, perm])
        logits = predict_ensemble(
            self.trees_, X, len(self.F_init_), self.learning_rate,
            np.array(self.F_init_, dtype=np.float32)
        )

    # Softmax with the row maximum subtracted for numerical stability.
    shifted = logits - logits.max(axis=1, keepdims=True)
    P = np.exp(shifted)
    P /= P.sum(axis=1, keepdims=True)
    return P
278
+
279
+ # ── save / load ───────────────────────────────────────────────────────────
280
+
281
def save(self, path: str) -> None:
    """Write this estimator to ``path`` with joblib (compression level 3)."""
    from joblib import dump

    dump(self, path, compress=3)
285
+
286
@classmethod
def load(cls, path: str) -> OQBoostClassifier:
    """Read back an estimator written by :meth:`save`.

    The unpickled object is returned without a type check. The file is a
    pickle, so only load paths that come from a trusted source.
    """
    from joblib import load as read_model

    return read_model(path)
291
+
292
def get_n_trees(self) -> int:
    """Count the trees held by the fitted model.

    For a one-vs-rest model this is the total over all binary sub-models.
    """
    check_is_fitted(self, "trees_")

    sub_models = getattr(self, "ovr_ensembles_", None)
    if not sub_models:
        return len(self.trees_)

    total = 0
    for member in sub_models:
        total += len(member.trees_)
    return total
298
+
299
+ # ── internal ─────────────────────────────────────────────────────────────
300
+
301
def _resolve_cat_idx(self, D: int) -> list[int]:
    """Sorted column indices declared categorical via ``cat_features``.

    Parameters
    ----------
    D : int
        Number of columns in the prepared feature matrix.

    Returns
    -------
    list of int
        Unique indices in ascending order. Names are resolved through
        ``feature_names_in_``; names that cannot be resolved (for example
        when the model is fitted on a plain array) are ignored.

    Raises
    ------
    ValueError
        If an integer entry lies outside ``[0, D)``. Such an index would
        otherwise reach the column permutation built during fitting, where
        a negative value duplicates a column and a too-large value fails
        with an indexing error.
    """
    if not self.cat_features:
        return []

    names = getattr(self, "feature_names_in_", None)
    cat_idx = set()
    for cf in self.cat_features:
        if isinstance(cf, (int, np.integer)):
            idx = int(cf)
            if not 0 <= idx < D:
                raise ValueError(
                    f"cat_features index {idx} is out of range for "
                    f"{D} features."
                )
            cat_idx.add(idx)
        elif names is not None and cf in names:
            cat_idx.add(names.index(cf))
    return sorted(cat_idx)
314
+
315
def _resolve_D_num(self, D: int) -> int:
    """Count the columns that are not declared categorical."""
    categorical = self._resolve_cat_idx(D)
    return D - len(categorical)
318
+
319
def _fit_core(self, X, y, X_val, y_val, D_num, sample_weight=None):
    """Train the ensemble on already-prepared arrays.

    Parameters
    ----------
    X : numpy.ndarray of shape (N, D), float32
        Output of ``_prepare_data``.
    y : numpy.ndarray of shape (N,), int64
        Labels, used directly as column indices (``K = y.max() + 1``).
    X_val, y_val : numpy.ndarray or None
        Optional validation data for metrics and early stopping.
    D_num : int
        Number of non-categorical columns.
    sample_weight : numpy.ndarray of shape (N,) or None
        Per-row multipliers applied to gradients and hessians.

    Sets ``trees_``, ``F_init_``, ``_prior_`` and ``_col_perm_``; in
    one-vs-rest mode it also sets ``ovr_ensembles_`` and leaves ``trees_``
    empty.
    """
    from ._oqboost import OQBoostContext, update_gradients

    N, D = X.shape
    K = int(y.max()) + 1
    # A fixed fallback seed keeps runs repeatable when random_state is None.
    seed = self.random_state if self.random_state is not None else 42

    cnt = np.bincount(y, minlength=K).astype(np.float32)
    self._prior_ = (cnt / N).tolist()

    if K >= 3 and self.multi_strategy == "ovr":
        # Train K separate binary classifiers
        self.ovr_ensembles_ = []
        self.trees_ = []  # Empty placeholder for check_is_fitted
        self.F_init_ = [0.0] * K  # Dummy
        self._col_perm_ = None

        # The sub-classifiers are fitted on a plain array and therefore
        # cannot resolve column names; hand them positional indices so that
        # categorical columns keep their categorical treatment.
        ovr_cat_features = self._resolve_cat_idx(D) or None

        for c in range(K):
            y_c = (y == c).astype(np.int64)

            eval_set_c = None
            if X_val is not None:
                eval_set_c = [(X_val, (y_val == c).astype(np.int64))]

            clf_c = OQBoostClassifier(
                n_estimators=self.n_estimators,
                learning_rate=self.learning_rate,
                max_depth=self.max_depth,
                reg_lambda=self.reg_lambda,
                subsample=self.subsample,
                early_stopping_rounds=self.early_stopping_rounds,
                random_state=seed + c,
                verbose=self.verbose,
                cat_features=ovr_cat_features,
                class_weight=None,
                prior_alpha=self.prior_alpha,
                inherited_rp_ratio=self.inherited_rp_ratio,
                mutation_rate=self.mutation_rate,
                mutation_strength=self.mutation_strength,
                pobs=self.pobs,
                goss=self.goss,
                goss_top_rate=self.goss_top_rate,
                goss_other_rate=self.goss_other_rate,
                reg_alpha=self.reg_alpha,
                gamma=self.gamma,
                min_child_weight=self.min_child_weight,
                max_leaves=self.max_leaves,
                max_bin=self.max_bin,
                colsample_bynode=self.colsample_bynode,
                multi_strategy="shared",
            )
            if self.verbose:
                print(f"  [OQBoost OVR] Fitting class {c+1}/{K}...")
            clf_c.fit(X, y_c, eval_set=eval_set_c, sample_weight=sample_weight)
            self.ovr_ensembles_.append(clf_c)
        return

    # Move categorical columns behind the numeric block unless they already
    # occupy the last positions.
    cat_idx = self._resolve_cat_idx(D)
    if cat_idx and cat_idx != list(range(D_num, D)):
        cat_set = set(cat_idx)
        perm = [i for i in range(D) if i not in cat_set] + cat_idx
        self._col_perm_ = np.asarray(perm, dtype=np.intp)
        X = np.ascontiguousarray(X[:, self._col_perm_])
        if X_val is not None:
            X_val = np.ascontiguousarray(X_val[:, self._col_perm_])
    else:
        self._col_perm_ = None

    # Initial scores: centred log class frequencies.
    lp = np.log(cnt / N + 1e-8).astype(np.float32)
    lp -= lp.mean()
    self.F_init_ = lp.tolist()

    Fsc = np.tile(lp, (N, 1))
    F_val = np.tile(lp, (X_val.shape[0], 1)) if X_val is not None else None

    oh = np.zeros((N, K), dtype=np.float32)
    oh[np.arange(N), y] = 1.0

    rng = np.random.default_rng(seed)

    best_val_loss = float("inf")
    # Trees are only ever appended, so the best model is a prefix of
    # ``self.trees_``; remembering its length replaces a per-round list copy.
    best_n_trees = 0
    no_improv = 0

    self.trees_ = []

    max_leaves = (
        self.max_leaves if self.max_leaves is not None
        else (1 << self.max_depth)
    )

    ctx = OQBoostContext(X, D_num=D_num, max_bin=self.max_bin)
    G_w = np.empty((N, K), dtype=np.float32)
    H_w = np.empty((N, K), dtype=np.float32)
    full_idx = np.arange(N, dtype=np.int32)
    try:
        for m in range(self.n_estimators):
            # NOTE(review): presumably fills G_w/H_w with softmax gradients
            # and hessians for the current scores; confirm in the extension.
            update_gradients(Fsc, oh, G_w, H_w)
            if sample_weight is not None:
                G_w *= sample_weight[:, np.newaxis]
                H_w *= sample_weight[:, np.newaxis]

            if self.goss:
                grad_norms = np.mean(np.abs(G_w), axis=1)
                top_n = int(self.goss_top_rate * N)
                other_n = int(self.goss_other_rate * (N - top_n))
                if top_n > 0 and other_n > 0 and (top_n + other_n) < N:
                    k = N - top_n
                    partitioned_idx = np.argpartition(grad_norms, k)
                    top_idx = partitioned_idx[k:]
                    remaining_idx = partitioned_idx[:k]
                    random_idx = rng.choice(
                        remaining_idx, size=other_n, replace=False
                    )
                    tree_sub = np.concatenate(
                        [top_idx, random_idx]
                    ).astype(np.int32)
                    # ``other_n`` rows stand in for all ``N - top_n``
                    # small-gradient rows, so each is re-weighted by the
                    # ratio of those counts to keep gradient sums unbiased.
                    scale_factor = (N - top_n) / other_n
                    G_w[random_idx] *= scale_factor
                    H_w[random_idx] *= scale_factor
                else:
                    tree_sub = full_idx
            elif self.subsample < 1.0:
                tree_sub = np.flatnonzero(
                    rng.random(N) < self.subsample
                ).astype(np.int32)
                # Fall back to all rows when the sample gets too small.
                if len(tree_sub) < min(N, 1000):
                    tree_sub = full_idx
            else:
                tree_sub = full_idx

            t, out_pred = ctx.build(
                G_w, H_w, tree_sub, self.max_depth, self.reg_lambda,
                inherited_rp_ratio=self.inherited_rp_ratio,
                mutation_rate=self.mutation_rate,
                mutation_strength=self.mutation_strength,
                seed=int(rng.integers(1 << 30)),
                pobs=getattr(self, "pobs", False),
                reg_alpha=self.reg_alpha,
                gamma=self.gamma,
                min_child_weight=self.min_child_weight,
                colsample_bynode=self.colsample_bynode,
                max_leaves=max_leaves,
            )

            self.trees_.append(t)
            Fsc += self.learning_rate * out_pred

            val_str = ""
            if X_val is not None:
                pred_val = t.predict(X_val)
                F_val = F_val + self.learning_rate * pred_val
                Fv_sh = F_val - F_val.max(axis=1, keepdims=True)
                P_val = np.exp(Fv_sh)
                P_val /= P_val.sum(axis=1, keepdims=True)
                val_loss = float(
                    -np.log(
                        P_val[np.arange(len(y_val)), y_val].clip(1e-8)
                    ).mean()
                )
                val_acc = (P_val.argmax(axis=1) == y_val).mean()
                val_str = f" | ValLoss={val_loss:.4f} | ValAcc={val_acc:.4f}"

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    no_improv = 0
                    best_n_trees = len(self.trees_)
                else:
                    no_improv += 1

            if self.verbose:
                Fsc_sh = Fsc - Fsc.max(axis=1, keepdims=True)
                Pm = np.exp(Fsc_sh)
                Pm /= Pm.sum(axis=1, keepdims=True)
                ll = -np.log(Pm[np.arange(N), y].clip(1e-8)).mean()
                acc = (Pm.argmax(axis=1) == y).mean()
                print(
                    f"  [OQBoost] Round {m+1:3d} | Loss={ll:.4f} | "
                    f"Acc={acc:.4f}{val_str}"
                )

            if X_val is not None and self.early_stopping_rounds is not None:
                if no_improv >= self.early_stopping_rounds:
                    if self.verbose:
                        print(f"  [OQBoost] Early stopping at round {m+1}")
                    # Roll back to the best validation round. This happens
                    # only on early stop; a run that uses every round keeps
                    # all of its trees.
                    self.trees_ = self.trees_[:best_n_trees]
                    break
    finally:
        # Always release the context, even if a round raises.
        ctx.close()
Binary file