kernelboost 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {kernelboost-0.2.1 → kernelboost-0.3.0}/CHANGELOG.md +14 -0
  2. {kernelboost-0.2.1/kernelboost.egg-info → kernelboost-0.3.0}/PKG-INFO +2 -2
  3. {kernelboost-0.2.1 → kernelboost-0.3.0}/PYPI_README.md +1 -1
  4. {kernelboost-0.2.1 → kernelboost-0.3.0}/README.md +27 -28
  5. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/__init__.py +1 -1
  6. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/booster.py +28 -25
  7. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/feature_selection.py +118 -45
  8. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/kernels.c +2 -2
  9. kernelboost-0.3.0/kernelboost/libmi.dll +0 -0
  10. kernelboost-0.3.0/kernelboost/libmi.so +0 -0
  11. kernelboost-0.3.0/kernelboost/mi_bins.c +72 -0
  12. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/rho_optimizer.py +0 -2
  13. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/tree.py +157 -22
  14. {kernelboost-0.2.1 → kernelboost-0.3.0/kernelboost.egg-info}/PKG-INFO +2 -2
  15. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost.egg-info/SOURCES.txt +3 -0
  16. {kernelboost-0.2.1 → kernelboost-0.3.0}/pyproject.toml +1 -1
  17. {kernelboost-0.2.1 → kernelboost-0.3.0}/LICENSE +0 -0
  18. {kernelboost-0.2.1 → kernelboost-0.3.0}/MANIFEST.in +0 -0
  19. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/backend.py +0 -0
  20. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/cpu_functions.py +0 -0
  21. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/estimator.py +0 -0
  22. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/gpu_functions.py +0 -0
  23. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/kernels.cu +0 -0
  24. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/libkernels.dll +0 -0
  25. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/libkernels.so +0 -0
  26. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/multiclassbooster.py +0 -0
  27. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/objectives.py +0 -0
  28. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/optimizer.py +0 -0
  29. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost/utilities.py +0 -0
  30. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost.egg-info/dependency_links.txt +0 -0
  31. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost.egg-info/requires.txt +0 -0
  32. {kernelboost-0.2.1 → kernelboost-0.3.0}/kernelboost.egg-info/top_level.txt +0 -0
  33. {kernelboost-0.2.1 → kernelboost-0.3.0}/setup.cfg +0 -0
@@ -2,6 +2,20 @@
 
  All notable changes to this project will be documented in this file.
 
+ ## [0.3.0] - 2026-03-07
+
+ ### Added
+ - Constant-leaf tree mode (`tree_type='constant'`) with vectorized MSE reduction splitting.
+ - MI-based relevance scoring in SmartSelector using histogram mutual information.
+ - C implementation of MI computation (`mi_bins.c`) with OpenMP parallelization.
+ - Temperature scheduling in SmartSelector via `temperature_max` parameter.
+ - `constant_tree_frequency` parameter in SmartSelector to control constant-leaf round frequency.
+
+ ### Changed
+ - **Breaking**: `feature_list` renamed to `feature_tree_tuple`. Now accepts `(feature_indices, tree_type)` tuples.
+ - Feature selectors now return `(features, tree_type)` tuples from `get_features()`.
+ - `feature_importances_` now correctly accumulates all rounds instead of losing duplicates.
+
  ## [0.2.1] - 2026-03-01
 
  ### Changed
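To make the breaking rename concrete, a minimal before/after sketch of the constructor call. This is illustrative only: `MSEObjective` is assumed to live in `kernelboost/objectives.py` (the file list above shows that module), and the index lists are made-up values.

```python
from kernelboost import KernelBooster
from kernelboost.objectives import MSEObjective  # assumed import path

obj = MSEObjective()

# 0.2.1: feature_list held bare index lists, one per boosting round
booster = KernelBooster(objective=obj, feature_list=[[0, 1], [2, 3]])

# 0.3.0: feature_tree_tuple pairs each round's indices with a tree type
booster = KernelBooster(
    objective=obj,
    feature_tree_tuple=(([0, 1], "kernel"), ([2, 3], "constant")),
)
```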
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: kernelboost
- Version: 0.2.1
+ Version: 0.3.0
  Summary: Gradient boosting with kernel regression base learners
  Author-email: tlaiho <tslaiho@gmail.com>
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: cupy>=11.0.0; extra == "all"
29
29
  ![C](https://img.shields.io/badge/C-language-blue)
30
30
  ![GPU](https://img.shields.io/badge/GPU-CUDA%20C%2FCuPy-orange)
31
31
  ![License](https://img.shields.io/badge/license-MIT-green)
32
- ![Version](https://img.shields.io/badge/version-0.2.0-blue)
32
+ ![Version](https://img.shields.io/badge/version-0.3.0-blue)
33
33
 
34
34
  KernelBoost is a gradient boosting algorithm that uses Nadaraya-Watson (local constant) kernel estimators as base learners instead of decision trees. It has:
35
35
 
@@ -7,7 +7,7 @@
  ![C](https://img.shields.io/badge/C-language-blue)
  ![GPU](https://img.shields.io/badge/GPU-CUDA%20C%2FCuPy-orange)
  ![License](https://img.shields.io/badge/license-MIT-green)
- ![Version](https://img.shields.io/badge/version-0.2.0-blue)
+ ![Version](https://img.shields.io/badge/version-0.3.0-blue)
 
  KernelBoost is a gradient boosting algorithm that uses Nadaraya-Watson (local constant) kernel estimators as base learners instead of decision trees. It has:
 
@@ -7,7 +7,7 @@
  ![C](https://img.shields.io/badge/C-language-blue)
  ![GPU](https://img.shields.io/badge/GPU-CUDA%20C%2FCuPy-orange)
  ![License](https://img.shields.io/badge/license-MIT-green)
- ![Version](https://img.shields.io/badge/version-0.2.1-blue)
+ ![Version](https://img.shields.io/badge/version-0.3.0-blue)
 
  kernelboost is a gradient boosting algorithm that uses Nadaraya-Watson (local constant) kernel estimators as base learners instead of decision trees. It has:
 
@@ -25,7 +25,7 @@ pip install kernelboost
  pip install cupy-cuda12x # for CUDA 12
  ```
 
- > **Dependencies**: NumPy only. CuPy optional for GPU acceleration.
+ > **Dependencies**: NumPy. CuPy optional for GPU acceleration.
 
  ## Quick Start
 
@@ -61,17 +61,17 @@ There are three main components to kernelboost: KernelBooster class that does th
 
  After calling fit, KernelBooster starts a training loop which is mostly identical to the algorithm described in Friedman (2001). The main difference is that KernelTree does not choose features through its splits but is instead given them by the booster class. Default feature selection is random with increasing kernel sizes in terms of number of features. Random feature selection naturally creates randomness to training results, which can be mitigated with a lower learning rate and more boosting iterations. Similarly to Friedman (2001), KernelBooster can fit several different objective functions, which are passed in as an Objective class.
 
- KernelTree splits numerical data by density and categorical data by MSE. The idea here is that the kernel bandwidth should largely depend on how dense the data is. For numerical data, KernelTree splits until number of observations is below the 'max_sample' parameter. Besides finding regions which would be well served by the same bandwidth, this has the benefit of speeding up computation significantly in calculating the kernel matrices for the kernel estimator. For example, with ten splits we go from computing a (n, n) matrix to computing ten (n/10, n/10) matrices with n²/10 operations instead of n² (assuming equal splits). This saves a whopping 90% of compute.
+ KernelTree splits numerical data by density and categorical data by MSE. It can also fit pure decision trees with mean values at leaves. The idea here is that the kernel bandwidth should largely depend on how dense the data is. For numerical data, KernelTree splits until number of observations is below the 'max_sample' parameter. Besides finding regions which would be well served by the same bandwidth, this has the benefit of speeding up computation significantly in calculating the kernel matrices for the kernel estimator. For example, with ten splits we go from computing a (n, n) matrix to computing ten (n/10, n/10) matrices with n²/10 operations instead of n² (assuming equal splits). This saves a nice 90% of compute.
 
- The actual estimation is handled by KernelEstimator. It optimizes a scalar precision (inverse bandwidth) for the local constant estimator using leave-one-out cross validation and random search between given bounds. It has both Gaussian and (isotropic) Laplace kernels with default being the Laplace kernel. KernelEstimator also has uncertainty quantification methods for quantile and conditional variance prediction, but these are at this moment still experimental as they use a "naive" single kernel method whose precision is optimized for mean prediction.
+ The actual estimation is handled by KernelEstimator. It optimizes a scalar precision (inverse bandwidth) for the local constant estimator using leave-one-out cross validation and random search between given bounds. It has both Gaussian and (isotropic) Laplace kernels with default being the Laplace kernel. KernelEstimator also has uncertainty quantification methods for quantile and conditional variance prediction (Fan & Yao 1998).
 
  ### Notable features
 
- Beyond the core boosting algorithm, kernelboost includes a few features worth highlighting:
+ Beyond the core boosting algorithm, a few features worth highlighting:
 
  #### Smart Feature Selection
 
- While the default feature selection is random (RandomSelector), the package includes an mRMR style probabilistic algorithm (SmartSelector) based on correlations between features and pseudo-residuals and performance in previous boosting rounds.
+ While the default feature selection is random (RandomSelector), the package includes an mRMR style probabilistic algorithm (SmartSelector) based on mutual information between features and pseudo-residuals and loss gain in previous boosting rounds.
 
  ```python
  from kernelboost.feature_selection import SmartSelector
@@ -79,7 +79,6 @@ from kernelboost.feature_selection import SmartSelector
  selector = SmartSelector(
      redundancy_penalty=0.4,
      relevance_alpha=0.7,
-     recency_penalty=0.3,
  )
 
  booster = KernelBooster(
@@ -113,7 +112,7 @@ lambda1, learning_rate = opt.find_hyperparameters()
 
  #### Uncertainty Quantification (Experimental)
 
- KernelBooster has both prediction intervals and conditional variance prediction based on kernel estimation. These come for "free" on top of training and require no extra data. Still work in progress.
+ KernelBooster has both prediction intervals and conditional variance prediction (Fan & Yao 1998) based on kernel estimation. These require no extra data and in that sense come for "free" on top of training. Still work in progress.
 
  ```python
  # Prediction intervals (90% by default)
@@ -123,7 +122,7 @@ lower, upper = booster.predict_intervals(X, alpha=0.1)
 
  variance = booster.predict_variance(X)
  ```
- Both interval coverage and conditional variance have a tendency to be underestimated, but this depends on the data and how well boosting has converged. No special tuning required: settings that optimize MSE have also given reasonable uncertainty estimates in testing. See [benchmarks](#uncertainty-quantification-california-housing) for a comparison with Gaussian Processes.
+ Both interval coverage and conditional variance have a tendency to be underestimated. See [benchmarks](#uncertainty-quantification-california-housing) for a comparison with Gaussian Processes.
 
  #### Data Preprocessing
 
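A quick way to sanity-check that tendency on held-out data, using only the `predict_intervals` call shown above. `booster`, `X_test`, and `y_test` stand in for whatever fitted model and holdout split you have; the metric names mirror the Coverage/Width columns in the benchmark below.

```python
import numpy as np

lower, upper = booster.predict_intervals(X_test, alpha=0.1)  # 90% intervals, as above
coverage = np.mean((y_test >= lower.ravel()) & (y_test <= upper.ravel()))
width = np.mean(upper - lower)
print(f"empirical coverage: {coverage:.1%}, mean width: {width:.3f}")
```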
@@ -197,10 +196,10 @@ Results have inherent randomness due to feature selection and subsampling. Scrip
  =================================================================
  Model                MSE      MAE      R²       Time
  -----------------------------------------------------------------
- kernelboost          0.2053   0.2985   0.8452   11.0s
- sklearn HGBR         0.2247   0.3146   0.8306   0.1s
- XGBoost              0.2155   0.3050   0.8376   0.1s
- LightGBM             0.2097   0.3047   0.8419   0.1s
+ kernelboost          0.1790   0.2781   0.8651   12.9s
+ sklearn HGBR         0.2103   0.3018   0.8415   0.2s
+ XGBoost              0.2080   0.2962   0.8432   0.1s
+ LightGBM             0.1972   0.2894   0.8513   0.1s
  =================================================================
  ```
 
@@ -209,10 +208,10 @@ LightGBM 0.2097 0.3047 0.8419 0.1s
  =================================================================
  Model                Accuracy   AUC-ROC   F1       Time
  -----------------------------------------------------------------
- kernelboost          0.9825     0.9984    0.9861   1.6s
- sklearn HGBC         0.9649     0.9944    0.9722   0.1s
- XGBoost              0.9561     0.9938    0.9650   0.0s
- LightGBM             0.9649     0.9925    0.9722   0.0s
+ kernelboost          0.9825     0.9984    0.9861   2.1s
+ sklearn HGBC         0.9649     0.9948    0.9722   0.1s
+ XGBoost              0.9561     0.9941    0.9650   0.0s
+ LightGBM             0.9737     0.9921    0.9790   0.0s
  =================================================================
  ```
 
@@ -223,10 +222,10 @@ Kernel Methods Benchmark (n_train=10000)
  =================================================================
  Model                MSE      MAE      R²       Time
  -----------------------------------------------------------------
- kernelboost          0.2091   0.3054   0.8430   6.5s
- KernelRidge          0.4233   0.4835   0.6822   1.7s
- SVR                  0.3136   0.3780   0.7646   3.5s
- GP (n=5000)          0.3297   0.4061   0.7524   67.7s
+ kernelboost          0.2027   0.2936   0.8456   4.7s
+ KernelRidge          0.4258   0.4828   0.6756   1.5s
+ SVR                  0.3133   0.3766   0.7613   3.3s
+ GP (n=5000)          0.3300   0.4038   0.7485   29.8s
  =================================================================
  ```
 
@@ -237,10 +236,10 @@ Prediction intervals and conditional variance estimates compared to Gaussian Pro
  =================================================================
  Uncertainty Quantification (90% intervals, alpha=0.1)
  =================================================================
- Model                Coverage   Width   Var Corr   Var Ratio
+ Model                Coverage   Width   Var Corr   Var Ratio
  -----------------------------------------------------------------
- kernelboost          88.1%      1.235   0.206      1.621
- GP (n=5000)          90.9%      1.863   0.157      1.026
+ kernelboost          91.4%      1.379   0.228      1.166
+ GP (n=5000)          91.0%      1.832   0.156      1.062
  =================================================================
  ```
 
@@ -255,10 +254,10 @@ GPU vs CPU Training Time (California Housing, n=10000)
  =================================================================
  Backend              Time
  -----------------------------------------------------------------
- CPU (C/OpenMP)       38.6s
- GPU (CuPy/CUDA)      4.6s
+ CPU (C/OpenMP)       48.2s
+ GPU (CuPy/CUDA)      7.3s
  =================================================================
- GPU speedup: 8.3x
+ GPU speedup: 6.7x
  ```
 
  All benchmarks run on Ubuntu 22.04 with Ryzen 7700 and RTX 3090.
@@ -274,7 +273,7 @@ All benchmarks run on Ubuntu 22.04 with Ryzen 7700 and RTX 3090.
 
  ## About
 
- kernelboost is a hobby project exploring alternatives to tree-based gradient boosting. Currently v0.2.1. Pre-compiled binaries included for Linux and Windows. Contributions and feedback welcome.
+ kernelboost is a hobby project exploring alternatives to tree-based gradient boosting. Pre-compiled binaries included for Linux and Windows. Contributions and feedback welcome.
 
  ## License
 
@@ -1,6 +1,6 @@
  """KernelBooster: Gradient boosting with Nadaraya-Watson (local constant) estimator as base learners."""
 
- __version__ = "0.2.1"
+ __version__ = "0.3.0"
 
  from .booster import KernelBooster
  from .multiclassbooster import MulticlassBooster
@@ -10,18 +10,19 @@ class KernelBooster:
      objective : Objective
          Loss function (e.g., MSEObjective(), EntropyObjective()).
      feature_selector : FeatureSelector, default=None
-         Feature selection strategy. If None and feature_list not provided,
+         Feature selection strategy. If None and feature_tree_tuple not provided,
          defaults to RandomSelector.
      feature_names : list, default=None
          Names for features. Uses indices if None.
-     feature_list : list, default=None
-         Explicit feature subsets per round. Takes priority over feature_selector.
      min_features : int, default=1
          Minimum features per round.
      max_features : int, default=None
          Maximum features per round. If None, uses min(10, n_features).
      n_estimators : int, default=None
          Number of boosting rounds. Auto-calculated from n_features if None.
+     feature_tree_tuple : tuple, default=None
+         Explicit feature subsets and tree type ('kernel' or 'constant') per round.
+         Takes priority over feature_selector.
      subsample_share : float, default=0.5
          Training sample share per round.
      lambda1 : float, default=0.0
@@ -66,10 +67,10 @@ class KernelBooster:
          objective,
          feature_selector: FeatureSelector = None,
          feature_names: list = None,
-         feature_list: list = None,
          min_features: int = 1,
          max_features: int = None,
          n_estimators: int = None,
+         feature_tree_tuple: tuple = None,
          subsample_share: float = 0.5,
          lambda1: float = 0.0,
          learning_rate: float = 0.5,
@@ -94,7 +95,7 @@ class KernelBooster:
          self.feature_selector = feature_selector
 
          self.feature_names = feature_names
-         self.feature_list = feature_list
+         self.feature_tree_tuple = feature_tree_tuple
          self.lambda1 = lambda1
          self.learning_rate = learning_rate
          self.n_estimators = n_estimators
@@ -163,6 +164,8 @@ class KernelBooster:
              raise ValueError(f"n_iter_no_change must be a positive integer or None, got {self.n_iter_no_change}")
          if not (0.0 <= self.overlap_epsilon < 0.5):
              raise ValueError(f"overlap_epsilon must be in [0.0, 0.5), got {self.overlap_epsilon}")
+         if self.feature_tree_tuple is not None and not isinstance(self.feature_tree_tuple, tuple):
+             raise ValueError(f"feature_tree_tuple: must be a tuple of (indices, tree_type) tuples")
 
      def _validate_data(self, X: np.ndarray, y: np.ndarray) -> None:
          """Validate training data."""
@@ -269,9 +272,6 @@ class KernelBooster:
 
          self._training_loop()
 
-         feature_tuples = (tuple(sublist) for sublist in self.fitted_features_)
-         self.rho_dict_ = dict(zip(feature_tuples, self.rho_))
-
          indices = self._last_n_active_tree_indices(1)
          self.last_active_tree_idx_ = indices[0] if indices else None
 
@@ -304,10 +304,10 @@ class KernelBooster:
          """Initialize training state."""
          self._sample_size = int(self.subsample_share * self.n_samples_)
 
-         # priority: explicit feature_list > feature_selector > default random
-         if self.feature_list is not None:
-             self.n_estimators_ = len(self.feature_list)
-             self.feature_list_ = self.feature_list
+         # priority: explicit feature_tree_tuple > feature_selector > default random
+         if self.feature_tree_tuple is not None:
+             self.n_estimators_ = len(self.feature_tree_tuple)
+             self.feature_tree_tuple_ = self.feature_tree_tuple
              self._use_selector = False
          else:
              # default to RandomSelector if no Selector given
@@ -362,11 +362,11 @@ class KernelBooster:
          """Execute one boosting iteration."""
          pseudoresiduals = self.objective.gradient(self.y_, self.predictions_)
 
-         # get features for this round
+         # get features and leaf type for this round
          if self._use_selector:
-             feature_indices = self.feature_selector.get_features(round_idx, pseudoresiduals)
+             feature_indices, tree_type = self.feature_selector.get_features(round_idx, pseudoresiduals)
          else:
-             feature_indices = self.feature_list_[round_idx]
+             feature_indices, tree_type = self.feature_tree_tuple_[round_idx]
 
          training_features = self.X_[:, feature_indices]
          all_data = np.concatenate((pseudoresiduals, training_features), axis=1)
@@ -384,6 +384,7 @@ class KernelBooster:
                  **self.tree_optimization,
                  use_gpu=self.use_gpu,
                  **self.kernel_optimization,
+                 tree_type=tree_type,
              )
          )
          self.trees_[-1].fit(training_data[:, 1:], training_data[:, 0].reshape(-1, 1))
@@ -391,13 +392,14 @@ class KernelBooster:
          # store tree predictions for hyperparameter optimization
          self.tree_predictions_.append(self.trees_[-1].predict(training_features))
 
-         precisions = [
-             est.precision_ for est, is_kern in zip(
-                 self.trees_[-1].compiled_.estimators, self.trees_[-1].compiled_.is_kernel
-             ) if is_kern
-         ]
-         if precisions:
-             self.last_precision_ = np.mean(precisions)
+         if tree_type == 'kernel':
+             precisions = [
+                 est.precision_ for est, is_kern in zip(
+                     self.trees_[-1].compiled_.estimators, self.trees_[-1].compiled_.is_kernel
+                 ) if is_kern
+             ]
+             if precisions:
+                 self.last_precision_ = np.mean(precisions)
 
          self.rho_.append(
              self.objective.line_search(
@@ -742,7 +744,7 @@ class KernelBooster:
          return {
              'objective': self.objective,
              'feature_names': self.feature_names,
-             'feature_list': self.feature_list,
+             'feature_tree_tuple': self.feature_tree_tuple,
              'feature_selector': self.feature_selector,
              'max_depth': self.max_depth,
              'max_sample': self.max_sample,
@@ -782,6 +784,7 @@ class KernelBooster:
              'initial_precision': self.initial_precision,
              'sample_share': self.sample_share,
              'precision_method': self.precision_method,
+             'pilot_factor': self.pilot_factor,
          }
 
          self.tree_optimization = {
@@ -814,10 +817,10 @@ class KernelBooster:
      @property
      def feature_importances_(self) -> np.ndarray:
          """Feature importance based on aggregated |rho| values."""
-         if not hasattr(self, 'rho_dict_'):
+         if not hasattr(self, 'trees_'):
              raise RuntimeError("Booster not fitted. Call fit() first.")
          importances = np.zeros(self.n_features_in_)
-         for feature_indices, rho in self.rho_dict_.items():
+         for feature_indices, rho in zip(self.fitted_features_, self.rho_):
              for idx in feature_indices:
                  importances[idx] += abs(rho)
          total = importances.sum()
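The hunk above is the `feature_importances_` fix from the changelog. The old `rho_dict_` was keyed by feature tuple, so a subset fitted in several rounds kept only its last rho; zipping over all rounds accumulates every one. A self-contained illustration with toy numbers:

```python
import numpy as np

fitted_features = [[0, 1], [2], [0, 1]]   # subset [0, 1] fitted in two rounds
rho = [0.5, 0.3, 0.2]

# 0.2.1 behaviour: dict keys collapse repeated subsets, dropping the first 0.5
rho_dict = dict(zip(map(tuple, fitted_features), rho))
print(rho_dict)                           # {(0, 1): 0.2, (2,): 0.3}

# 0.3.0 behaviour: zip over all rounds accumulates every |rho|
importances = np.zeros(3)
for feats, r in zip(fitted_features, rho):
    for idx in feats:
        importances[idx] += abs(r)
print(importances / importances.sum())    # features 0 and 1 no longer under-weighted
```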
@@ -1,6 +1,45 @@
  from abc import ABC, abstractmethod
  from collections.abc import Generator
+ import ctypes
+ import os
+ import platform
+
  import numpy as np
+ from numpy.ctypeslib import ndpointer
+
+ # Load C library for fast MI computation
+ _dir_path = os.path.dirname(os.path.realpath(__file__))
+
+ _system = platform.system()
+ if _system == "Linux":
+     _mi_libname = f"{_dir_path}/libmi.so"
+ elif _system == "Windows":
+     _mi_libname = f"{_dir_path}/libmi.dll"
+ elif _system == "Darwin":
+     _mi_libname = f"{_dir_path}/libmi.dylib"
+ else:
+     raise Exception(f"Platform '{_system}' not supported.")
+
+ try:
+     _mi_lib = ctypes.CDLL(_mi_libname)
+ except OSError:
+     raise OSError(
+         f"Could not load C library at {_mi_libname}. "
+         f"Compile it with: gcc -shared -o {_mi_libname} -fPIC kernelboost/mi_bins.c "
+         f"-lm -fopenmp -O3 -march=native -ffast-math -funroll-loops -flto"
+     )
+
+ _mi_lib.histogram_mi_batch.restype = None
+ _mi_lib.histogram_mi_batch.argtypes = (
+     ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),  # X
+     ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),  # residuals
+     ctypes.c_int,  # n
+     ctypes.c_int,  # n_features
+     ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),  # x_thresholds
+     ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),  # y_thresholds
+     ctypes.c_int,  # n_thresh
+     ndpointer(ctypes.c_float, flags="C_CONTIGUOUS"),  # out_mi
+ )
 
 
  class FeatureSelector(ABC):
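For readers without the compiled library, a rough NumPy equivalent of what `histogram_mi_batch` computes per feature: bin both variables by precomputed quantile thresholds, then take the discrete mutual information of the resulting 2D histogram. This is an illustrative re-implementation, not part of the package API:

```python
import numpy as np

def histogram_mi(x, residuals, x_thresholds, y_thresholds):
    """Discrete MI between one feature column and the residuals."""
    n_bins = len(x_thresholds) - 1
    # interior thresholds act as bin edges (mirrors find_bin's clipping in mi_bins.c)
    xi = np.clip(np.searchsorted(x_thresholds[1:-1], x, side="right"), 0, n_bins - 1)
    yi = np.clip(np.searchsorted(y_thresholds[1:-1], residuals, side="right"), 0, n_bins - 1)
    pxy = np.histogram2d(xi, yi, bins=n_bins, range=((0, n_bins), (0, n_bins)))[0] / len(x)
    px, py = pxy.sum(axis=1), pxy.sum(axis=0)
    mask = pxy > 0
    return max(0.0, float(np.sum(pxy[mask] * np.log(pxy[mask] / np.outer(px, py)[mask]))))
```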
@@ -45,9 +84,9 @@ class FeatureSelector(ABC):
          pass
 
      @abstractmethod
-     def get_features(self, round_idx: int, residuals: np.ndarray) -> list[int]:
+     def get_features(self, round_idx: int, residuals: np.ndarray) -> tuple[list[int], str]:
          """
-         Get feature indices for the next boosting round.
+         Get feature indices and tree type for the next boosting round.
 
          Args:
              round_idx : int
@@ -56,8 +95,8 @@ class FeatureSelector(ABC):
                  Current pseudo-residuals (n_samples,)
 
          Returns:
-             list[int]
-                 Feature indices to use for this round
+             tuple[list[int], str]
+                 Feature indices and leaf type ('kernel' or 'constant')
          """
          pass
 
@@ -137,32 +176,37 @@ class RandomSelector(FeatureSelector):
              self._rng.shuffle(features)
              yield features[:max_size].tolist()
 
-     def get_features(self, round_idx: int, residuals: np.ndarray) -> list[int]:
+     def get_features(self, round_idx: int, residuals: np.ndarray) -> tuple[list[int], str]:
          selected = next(self._gen)
-         return self._complete_groups(selected)
+         return self._complete_groups(selected), "kernel"
 
 
  class SmartSelector(FeatureSelector):
      """
-     Feature selection using mRMR-style approach using correlations.
-     Selects features probabilistically based on relevance, redundancy and recency.
+     Feature selection using mRMR-style approach with mutual information relevance.
+     Selects features probabilistically based on relevance, redundancy and recency.
      Kernel sizes progress from small to large.
 
      Args:
          redundancy_penalty : float, default=0.4
              Weight for redundancy penalty (0 = ignore, 1 = strong penalty)
-         relevance_alpha : float, default=0.6
-             Balance between residual correlation (1.0) and historical weight (0.0)
-         recency_penalty : float, default=0.3
+         relevance_alpha : float, default=0.7
+             Balance between MI relevance (1.0) and historical weight (0.0)
+         recency_penalty : float, default=0.35
              Penalty applied to recently-used features (0 = no penalty, 1 = strong)
          recency_decay : float, default=0.7
              Decay factor for recency penalty each round (0 = instant decay, 1 = no decay)
          temperature : float, default=0.3
-             Softmax temperature for feature selection. Lower = greedier, higher = more exploration.
-         weight_decay : float, default=0.9
+             Softmax temperature (minimum when using schedule). Higher means more exploration.
+         temperature_max : float | None, default=None
+             Starting temperature for schedule. None means no schedule (fixed temperature).
+             Decays linearly from temperature_max to temperature over all rounds.
+         weight_decay : float, default=0.95
              Decay factor for feature weights each round.
          feature_groups : list[list[int]] | None, default=None
              Groups of features that should be selected together.
+         constant_tree_frequency : int, default=25
+             Insert a constant-leaf tree every N rounds.
          seed : int, optional
              Random seed for reproducibility.
      """
@@ -170,12 +214,14 @@ class SmartSelector(FeatureSelector):
      def __init__(
          self,
          redundancy_penalty: float = 0.4,
-         relevance_alpha: float = 0.6,
-         recency_penalty: float = 0.3,
+         relevance_alpha: float = 0.7,
+         recency_penalty: float = 0.35,
          recency_decay: float = 0.7,
          temperature: float = 0.3,
-         weight_decay: float = 0.9,
+         temperature_max: float | None = None,
+         weight_decay: float = 0.95,
          feature_groups: list[list[int]] | None = None,
+         constant_tree_frequency: int = 25,
          seed: int | None = None,
      ):
          super().__init__()
@@ -184,10 +230,13 @@ class SmartSelector(FeatureSelector):
          self.recency_penalty = recency_penalty
          self.recency_decay = recency_decay
          self.temperature = temperature
+         self.temperature_max = temperature_max
          self.weight_decay = weight_decay
          self.feature_groups = feature_groups
          self.seed = seed if seed is not None else np.random.randint(0, 2**31)
 
+         self.constant_frequency = constant_tree_frequency
+
      def initialize(
          self,
          X: np.ndarray,
@@ -197,12 +246,22 @@ class SmartSelector(FeatureSelector):
          rounds: int,
      ) -> int:
          self.n_features = n_features
-
-         self.X_std_ = X.std(axis=0)
-         self.X_centered_ = X - X.mean(axis=0)
+         self.X_ = X.view()
+         self.n_bins_ = max(10, int(np.sqrt(X.shape[0] / 5)))
+         self.rounds_ = rounds
+         self.schedule_rounds_ = max(1, rounds) if self.temperature_max is not None else None
 
          self.corr_matrix_ = np.corrcoef(X, rowvar=False)
          self.corr_matrix_ = np.nan_to_num(self.corr_matrix_, nan=0.0)
+
+         # precompute quantile thresholds for each feature (fixed count for C compatibility)
+         self.quantiles = np.linspace(0, 1, self.n_bins_ + 1)
+         self.n_thresh_ = self.n_bins_ + 1
+         self.x_thresholds_ = np.array([
+             np.quantile(X[:, f], self.quantiles)
+             for f in range(n_features)
+         ], dtype=np.float32)
+
          self.feature_weights_ = np.zeros(n_features)
          self.recency_scores_ = np.zeros(n_features)
          self._rng = np.random.default_rng(self.seed)
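The `n_bins_` rule above scales histogram resolution with sample size, floored at 10 bins; evaluated standalone:

```python
import numpy as np

# n_bins_ = max(10, int(np.sqrt(n / 5))): coarse for small n, finer as n grows
for n in (500, 5_000, 50_000):
    print(n, max(10, int(np.sqrt(n / 5))))   # 500 -> 10, 5000 -> 31, 50000 -> 100
```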
@@ -229,10 +288,16 @@ class SmartSelector(FeatureSelector):
              yield max_size
 
      def get_features(self, round_idx: int, residuals: np.ndarray) -> list[int]:
-         k = next(self._size_gen)
-         relevance = self._compute_relevance(residuals)
-         selected = self._select_features(k, relevance)
-         return self._complete_groups(selected)
+         if round_idx > 0 and round_idx % self.constant_frequency == 0:
+             tree_type = "constant"
+             selected = list(range(self.n_features))
+         else:
+             tree_type = "kernel"
+             n_features = next(self._size_gen)
+             relevance = self._compute_relevance(residuals)
+             selected = self._select_features(n_features, relevance, round_idx)
+
+         return self._complete_groups(selected), tree_type
 
      def update(self, feature_indices: list[int], gain: float) -> None:
          self.recency_scores_ *= self.recency_decay
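With the default `constant_tree_frequency=25`, the modulo check above schedules constant-leaf rounds like this (a toy loop over 100 rounds):

```python
constant_frequency = 25
tree_types = ["constant" if i > 0 and i % constant_frequency == 0 else "kernel"
              for i in range(100)]
print([i for i, t in enumerate(tree_types) if t == "constant"])   # [25, 50, 75]
```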
@@ -246,32 +311,43 @@ class SmartSelector(FeatureSelector):
              self.feature_weights_[idx] += weight_increment
 
      def _compute_relevance(self, pseudoresiduals: np.ndarray) -> np.ndarray:
-         """Compute relevance scores from residual correlation, history, and recency."""
-         pseudoresiduals = pseudoresiduals.ravel()
-
-         # correlation with pseudoresiduals
-         r_centered = pseudoresiduals - pseudoresiduals.mean()
-         r_std = r_centered.std()
-
-         cov = self.X_centered_.T @ r_centered / len(pseudoresiduals)
-         denom = self.X_std_ * r_std
-         correlations = np.abs(np.divide(cov, denom, out=np.zeros_like(cov), where=denom > 1e-10))
+         """Compute relevance scores from MI with residuals, history, and recency."""
+         residuals = np.ascontiguousarray(pseudoresiduals.ravel(), dtype=np.float32)
+         y_thresholds = np.quantile(residuals, self.quantiles).astype(np.float32)
+
+         raw_scores = np.zeros(self.n_features, dtype=np.float32)
+         _mi_lib.histogram_mi_batch(
+             np.ascontiguousarray(self.X_, dtype=np.float32),
+             residuals,
+             self.X_.shape[0],
+             self.n_features,
+             np.ascontiguousarray(self.x_thresholds_),
+             np.ascontiguousarray(y_thresholds),
+             self.n_thresh_,
+             raw_scores,
+         )
 
-         # normalize to max values
-         corr_norm = correlations / (correlations.max() + 1e-10)
+         # normalize to [0, 1]
+         scores_norm = raw_scores / (raw_scores.max() + 1e-10)
          weights_norm = self.feature_weights_ / (self.feature_weights_.max() + 1e-10)
 
          relevance = (
-             self.relevance_alpha * corr_norm +
-             (1 - self.relevance_alpha) * weights_norm -
+             self.relevance_alpha * scores_norm +
+             (1 - self.relevance_alpha) * weights_norm -
              self.recency_penalty * self.recency_scores_
          )
-         relevance = np.maximum(relevance, 0.0)
+         return np.maximum(relevance, 0.0)
 
-         return relevance
+     def _get_temperature(self, round_idx: int) -> float:
+         """Compute temperature for the current round."""
+         if self.schedule_rounds_ is None:
+             return self.temperature
+         progress = min(round_idx / self.schedule_rounds_, 1.0)
+         return self.temperature_max + (self.temperature - self.temperature_max) * progress
 
-     def _select_features(self, k: int, relevance: np.ndarray) -> list[int]:
+     def _select_features(self, k: int, relevance: np.ndarray, round_idx: int) -> list[int]:
          """Select k features probabilistically using relevance and redundancy."""
+         temp = self._get_temperature(round_idx)
          selected = []
          available = list(range(self.n_features))
 
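The `_get_temperature` schedule above is a plain linear interpolation from `temperature_max` down to `temperature`; evaluated standalone (the values here are example settings, not package defaults):

```python
def temperature_at(round_idx, t_min=0.3, t_max=1.0, rounds=100):
    # mirrors _get_temperature: linear decay, clamped once past the last round
    progress = min(round_idx / rounds, 1.0)
    return t_max + (t_min - t_max) * progress

print(temperature_at(0), temperature_at(50), temperature_at(200))   # 1.0 0.65 0.3
```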
@@ -280,7 +356,6 @@ class SmartSelector(FeatureSelector):
                  break
              scores = np.zeros(len(available))
              for i, j in enumerate(available):
-                 # redundancy with already selected
                  if selected:
                      redundancy = np.mean([
                          abs(self.corr_matrix_[j, s]) for s in selected
@@ -290,12 +365,10 @@ class SmartSelector(FeatureSelector):
 
                  scores[i] = relevance[j] - self.redundancy_penalty * redundancy
 
-             # convert scores to probabilities
-             scaled = scores / self.temperature
-             exp_scores = np.exp(scaled - scaled.max())  # subtract max for numerical stability
+             scaled = scores / temp
+             exp_scores = np.exp(scaled - scaled.max())
              probs = exp_scores / exp_scores.sum()
 
-             # select based on probabilities
              idx = self._rng.choice(len(available), p=probs)
              feat = available[idx]
 
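The selection step above is a temperature-scaled softmax with the usual max-subtraction for stability; in isolation, lower temperatures sharpen the distribution toward the top-scoring feature:

```python
import numpy as np

scores = np.array([0.9, 0.4, 0.1])
for temp in (1.0, 0.3):
    scaled = scores / temp
    probs = np.exp(scaled - scaled.max())   # max-subtraction avoids overflow
    probs /= probs.sum()
    print(temp, probs.round(3))             # temp=0.3 is much greedier than temp=1.0
```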
@@ -71,8 +71,8 @@ float loo_mse(
      int n = training_obs;
      size_t tri_size = (size_t)n * (n + 1) / 2;
 
-     // allocate upper triangle storage
-     float *upper = (float *)malloc(tri_size * sizeof(float));
+     // allocate upper triangle storage, check null
+     float *upper = malloc(tri_size * sizeof(float));
      if (!upper) return -1.0f;
 
      // first pass: compute upper triangle kernel values
Binary file
Binary file
@@ -0,0 +1,72 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <math.h>
+ #include <omp.h>
+
+ // C code for fast MI estimation over the whole feature set
+
+ static int find_bin(float *thresholds, int n_thresh, float val) {
+     int lo = 0, hi = n_thresh - 2;
+     while (lo <= hi) {
+         int mid = (lo + hi) / 2;
+         if (val < thresholds[mid])
+             hi = mid - 1;
+         else
+             lo = mid + 1;
+     }
+     int bin = lo - 1;
+     if (bin < 0) bin = 0;
+     if (bin > n_thresh - 2) bin = n_thresh - 2;
+     return bin;
+ }
+
+ void histogram_mi_batch(
+     float *X,            /* (n, n_features), row-major */
+     float *residuals,    /* (n,) */
+     int n,
+     int n_features,
+     float *x_thresholds, /* (n_features, n_thresh) */
+     float *y_thresholds, /* (n_thresh,) */
+     int n_thresh,        /* n_bins + 1 */
+     float *out_mi        /* (n_features,) output */ ) {
+
+     int n_bins = n_thresh - 1;
+     size_t binsize = (size_t) n_bins * n_bins;
+
+     #pragma omp parallel for schedule(dynamic)
+     for (int f=0; f < n_features; f++) {
+         float *hist = calloc(binsize, sizeof(float));
+         for (int i=0; i < n; i++) {
+             int xi = find_bin(x_thresholds + f * n_thresh, n_thresh, X[i * n_features + f]);
+             int yi = find_bin(y_thresholds, n_thresh, residuals[i]);
+             hist[xi * n_bins + yi] += 1;
+         }
+
+         // convert to probabilities, compute marginals
+         double inv_n = 1.0 / n;
+         float *pxy = calloc(binsize, sizeof(float));
+         float *px = calloc(n_bins, sizeof(float));
+         float *py = calloc(n_bins, sizeof(float));
+
+         for (int x_index=0; x_index < n_bins; x_index++){
+             for (int y_index=0; y_index < n_bins; y_index++){
+                 float probability = hist[x_index * n_bins + y_index] * inv_n;
+                 pxy[x_index * n_bins + y_index] = probability;
+                 px[x_index] += probability;
+                 py[y_index] += probability;
+             }
+         }
+
+         // MI
+         double mi = 0.0;
+         for (int x_index=0; x_index < n_bins; x_index++){
+             for (int y_index=0; y_index < n_bins; y_index++){
+                 if (pxy[x_index * n_bins + y_index] > 0 && px[x_index] * py[y_index] > 0)
+                     mi += (pxy[x_index * n_bins + y_index] *
+                            log(pxy[x_index * n_bins + y_index] / (px[x_index] * py[y_index])));
+             }
+         }
+         out_mi[f] = fmax(0, mi);
+         free(hist); free(pxy); free(px); free(py);
+     }
+ }
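Putting the new pieces together: a small driver for the ctypes binding declared in feature_selection.py. `_mi_lib` is a private module attribute, so importing it is for illustration only; the array shapes follow the argtypes shown earlier, and the synthetic data is made up.

```python
import numpy as np
from kernelboost.feature_selection import _mi_lib   # private attribute; illustration only

rng = np.random.default_rng(0)
n, n_features, n_bins = 1_000, 5, 14
X = rng.standard_normal((n, n_features)).astype(np.float32)
residuals = (X[:, 0] + 0.1 * rng.standard_normal(n)).astype(np.float32)

quantiles = np.linspace(0, 1, n_bins + 1)
x_thresholds = np.ascontiguousarray(
    np.quantile(X, quantiles, axis=0).T.astype(np.float32))   # (n_features, n_thresh)
y_thresholds = np.quantile(residuals, quantiles).astype(np.float32)

out_mi = np.zeros(n_features, dtype=np.float32)
_mi_lib.histogram_mi_batch(np.ascontiguousarray(X), residuals, n, n_features,
                           x_thresholds, y_thresholds, n_bins + 1, out_mi)
print(out_mi)   # feature 0 should carry the highest MI with the residuals
```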
@@ -371,8 +371,6 @@ class RhoOptimizer:
 
          self.booster_.rho_ = list(self.rho_)
 
-         feature_tuples = (tuple(sublist) for sublist in self.booster_.fitted_features_)
-         self.booster_.rho_dict_ = dict(zip(feature_tuples, self.booster_.rho_))
 
          if self.lambda1_ is not None:
              self.booster_.lambda1 = self.lambda1_
@@ -87,9 +87,8 @@ class CompiledTree:
 
  class KernelTree:
      """
-     Decision tree that splits numerical features by density, categorical by MSE gain.
-     Leaf nodes contain KernelEstimators or constants depending on sample size and
-     whether numerical features present.
+     Decision tree that splits either by density (for kernel leaves) or by MSE gain
+     (categorical features or constant leaves).
 
      Args:
          min_sample : int, default=500
@@ -120,6 +119,13 @@ class KernelTree:
              Precision selection method: 'search' (LOO-CV) or 'silverman'.
          pilot_factor : float, default=3.0
              Multiplier for pilot precision bounds: search range is [p/factor, p*factor].
+         tree_type : str, default='kernel'
+             Leaf node type: 'kernel' for kernel estimation or 'constant' for constant leaves.
+         gain_threshold : float, default=1e-3
+             Minimum MSE gain required for a split in constant leaf mode.
+         quantiles : list, optional
+             Split candidate quantiles for constant leaf mode.
+             If None, defaults to linspace(0.01, 0.99, 99).
      """
 
      def __init__(
@@ -130,13 +136,16 @@ class KernelTree:
          feature_types: dict = None,
          overlap_epsilon: float = 0.05,
          use_gpu: bool = False,
-         kernel_type: str = 'gaussian',
+         kernel_type: str = 'laplace',
          search_rounds: int = 20,
          bounds: tuple = (0.10, 35.0),
         initial_precision: float = 0.0,
          sample_share: float = 1.0,
          precision_method: str = 'pilot-cv',
          pilot_factor: float = 3.0,
+         tree_type: str = 'kernel',
+         gain_threshold: float = 1e-3,
+         quantiles: list = None,
      ):
 
          self.min_sample = min_sample
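A minimal fit of the new constant-leaf mode, assuming `KernelTree` is importable from `kernelboost.tree` (the module path in the file list above); the data is synthetic:

```python
import numpy as np
from kernelboost.tree import KernelTree   # import path assumed from the file list

rng = np.random.default_rng(0)
X = rng.standard_normal((2_000, 3)).astype(np.float32)
y = (X[:, 0] ** 2 + 0.1 * rng.standard_normal(2_000)).reshape(-1, 1)

tree = KernelTree(tree_type='constant')   # mean-valued leaves, MSE-gain splits
tree.fit(X, y)                            # same fit shape the booster uses
print(tree.depth_, tree.predict(X[:5]))
```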
@@ -152,6 +161,13 @@ class KernelTree:
          self.sample_share = sample_share
          self.precision_method = precision_method
          self.pilot_factor = pilot_factor
+         self.tree_type = tree_type
+         self.gain_threshold = gain_threshold
+
+         if quantiles is None:
+             self.quantiles = np.linspace(0.01, 0.99, 99)
+         else:
+             self.quantiles = quantiles
 
          self.kernel_optimization = {
              'kernel_type': kernel_type,
@@ -163,6 +179,10 @@ class KernelTree:
              'pilot_factor': pilot_factor,
          }
 
+         # min sample decreased, depth increase for non-kernel trees
+         self._const_min_sample = max(50, self.min_sample // 5)
+         self._const_max_depth = self.max_depth + 3
+
          self._validate_params()
 
      def _validate_params(self):
@@ -180,31 +200,38 @@ class KernelTree:
              raise ValueError(f"feature_types values must be 'C' or 'N', got invalid keys: {invalid}")
          if not (0.0 <= self.overlap_epsilon < 0.5):
              raise ValueError("overlap_epsilon must be in [0.0, 0.5)")
+         if self.tree_type not in {"kernel", "constant"}:
+             raise ValueError(f"tree_type must be 'kernel' or 'constant', got '{self.tree_type}'")
+         if self.gain_threshold < 0:
+             raise ValueError(f"gain_threshold must be >= 0, got {self.gain_threshold}")
 
      def fit(self, X: np.ndarray, y: np.ndarray) -> "KernelTree":
          """Fit the tree to training data."""
          self.X_ = X.astype(np.float32)
          self.y_ = y.astype(np.float32).ravel()
          self.n_samples_, self.n_features_ = X.shape
-         self.feature_ranges_ = self.X_.max(axis=0) - self.X_.min(axis=0)
+
          self._detect_types()
+         if not self.numerical_:
+             self.tree_type = "constant"
+
+         if self.tree_type == "constant":
+             self.categorical_ = []
+             self.numerical_ = list(range(self.n_features_))
+             # uses indices rather than values:
+             sorted_by_feat = [np.argsort(self.X_[:, f]) for f in range(self.n_features_)]
+             self.root_ = self._grow_constant(sorted_by_feat)
+         else:
+             self.feature_ranges_ = self.X_.max(axis=0) - self.X_.min(axis=0)
+             self.root_ = self._grow_numerical(self.X_, self.y_)
+             if self.categorical_:
+                 self.root_ = self._expand_categorical(self.root_, self.X_, self.y_)
 
-         self.root_ = self._grow_numerical(self.X_, self.y_)
-         if self.categorical_:
-             self.root_ = self._expand_categorical(self.root_, self.X_, self.y_)
          self.compiled_ = self._compile(self.root_)
          self.depth_ = self._compute_depth(self.root_)
+         del self.X_, self.y_
          return self
 
-     def _compute_depth(self, node: Node, current: int = 0) -> int:
-         """Compute the maximum depth of the tree."""
-         if isinstance(node, Leaf):
-             return current
-         return max(
-             self._compute_depth(node.left, current + 1),
-             self._compute_depth(node.right, current + 1)
-         )
-
      def _detect_types(self) -> None:
          """Classify features as categorical or numerical."""
          self.categorical_, self.numerical_ = [], []
@@ -225,7 +252,6 @@ class KernelTree:
                  self.categorical_.append(i)
              else:
                  self.numerical_.append(i)
-         self._use_kernel = len(self.numerical_) > 0
 
      def _grow_numerical(self, X: np.ndarray, y: np.ndarray) -> Node:
          """Grow a tree on numerical features."""
@@ -297,7 +323,7 @@ class KernelTree:
              return self._make_leaf(X, y)
 
          feat, thresh, gain = split
-         if gain < 1e-4:  # no meaningful improvement
+         if gain < self.gain_threshold:
              return self._make_leaf(X, y)
 
          left_mask = X[:, feat] <= thresh
@@ -312,7 +338,7 @@ class KernelTree:
          best_mse, best_feat, best_thresh = base_mse, None, None
 
          for f in self.categorical_:
-             values = np.unique(X[:, f])
+             values = np.unique(X[:, f])
              for thresh in values[:-1]:
                  left_mask = X[:, f] <= thresh
                  n_left, n_right = left_mask.sum(), (~left_mask).sum()
@@ -329,17 +355,115 @@ class KernelTree:
              return None
 
          return best_feat, best_thresh, base_mse - best_mse
-
+
      def _make_leaf(self, X: np.ndarray, y: np.ndarray) -> Leaf:
          """Create leaf with kernel estimator or mean constant."""
          n = len(y)
-         if self._use_kernel and n <= self.max_sample:
+         if n <= self.max_sample:
              X_num = np.delete(X, self.categorical_, axis=1) if self.categorical_ else X
              k = KernelEstimator(use_gpu=self.use_gpu, **self.kernel_optimization)
              k.fit(X_num, y)
              return Leaf(k)
          return Leaf(float(np.mean(y)))
 
+     def _grow_constant(self, sorted_by_feat: list[np.ndarray], depth: int = 0) -> Node:
+         """Grow a tree with constant leaves."""
+         n = len(sorted_by_feat[0])
+         samples = sorted_by_feat[0]
+
+         if depth >= self._const_max_depth or n < 1.5 * self._const_min_sample:
+             return Leaf(float(np.mean(self.y_[samples])))
+
+         split = self._find_constant_split(sorted_by_feat, n)
+         if split is None:
+             return Leaf(float(np.mean(self.y_[samples])))
+
+         feat, thresh, gain = split
+         if gain < self.gain_threshold:
+             return Leaf(float(np.mean(self.y_[samples])))
+
+         left_sorted, right_sorted = [], []
+         for f in range(self.n_features_):
+             left_mask = self.X_[sorted_by_feat[f], feat] <= thresh
+             left_sorted.append(sorted_by_feat[f][left_mask])
+             right_sorted.append(sorted_by_feat[f][~left_mask])
+
+         return Branch(feat, thresh,
+                       self._grow_constant(left_sorted, depth + 1),
+                       self._grow_constant(right_sorted, depth + 1))
+
+     def _find_constant_split(
+             self,
+             sorted_by_feat: list[np.ndarray],
+             n: int) -> tuple[int, float, float] | None:
+         """Find best split by MSE reduction across all features."""
+         base_mse = np.var(self.y_[sorted_by_feat[0]])
+         best_mse, best_feat, best_thresh = base_mse, None, None
+         min_s = self._const_min_sample
+
+         for f in range(self.n_features_):
+             idx = sorted_by_feat[f]
+             col_sorted = self.X_[idx, f]
+             y_sorted = self.y_[idx].astype(np.float64)
+
+             cum_sum = np.cumsum(y_sorted)
+             cum_sq = np.cumsum(y_sorted ** 2)
+             total_sum = cum_sum[-1]
+             total_sq = cum_sq[-1]
+
+             values = np.quantile(col_sorted, self.quantiles)
+             values = values[:-1]  # skip last candidate
+             positions = np.searchsorted(col_sorted, values, side='right')
+
+             # filter valid splits
+             valid = (positions >= min_s) & (positions <= n - min_s)
+             if not np.any(valid):
+                 continue
+
+             pos = positions[valid]
+             thresholds = values[valid]
+
+             # skip duplicates — same partition, same MSE
+             unique_mask = np.empty(len(pos), dtype=bool)
+             unique_mask[0] = True
+             unique_mask[1:] = np.diff(pos) != 0
+             pos = pos[unique_mask]
+             thresholds = thresholds[unique_mask]
+
+             # vectorized MSE via cumulative sums
+             left_n = pos.astype(np.float64)
+             left_sum = cum_sum[pos - 1]
+             left_sq = cum_sq[pos - 1]
+             right_n = n - left_n
+             right_sum = total_sum - left_sum
+             right_sq = total_sq - left_sq
+
+             # guards against negatives
+             left_var = np.maximum(0.0, left_sq / left_n - (left_sum / left_n) ** 2)
+             right_var = np.maximum(0.0, right_sq / right_n - (right_sum / right_n) ** 2)
+
+             mse = (left_n * left_var + right_n * right_var) / n
+
+             idx_best = np.argmin(mse)
+             if mse[idx_best] < best_mse:
+                 best_mse = mse[idx_best]
+                 best_feat = f
+                 best_thresh = float(thresholds[idx_best])
+
+         if best_feat is None:
+             return None
+
+         return best_feat, best_thresh, base_mse - best_mse
+
+     def _compute_depth(self, node: Node, current: int = 0) -> int:
+         """Compute the maximum depth of the tree."""
+         if isinstance(node, Leaf):
+             return current
+         return max(
+             self._compute_depth(node.left, current + 1),
+             self._compute_depth(node.right, current + 1)
+         )
+
      def _compile(self, root: Node) -> CompiledTree:
          """Convert nested tree structure to flat thresholds for prediction
          and create overlap for training data over the thresholds."""
428
552
  'initial_precision': self.initial_precision,
429
553
  'sample_share': self.sample_share,
430
554
  'precision_method': self.precision_method,
555
+ 'pilot_factor': self.pilot_factor,
556
+ 'tree_type': self.tree_type,
557
+ 'gain_threshold': self.gain_threshold,
558
+ 'quantiles': self.quantiles,
431
559
  }
432
560
 
433
561
  def set_params(self, **params) -> "KernelTree":
@@ -445,8 +573,15 @@ class KernelTree:
445
573
  'initial_precision': self.initial_precision,
446
574
  'sample_share': self.sample_share,
447
575
  'precision_method': self.precision_method,
576
+ 'pilot_factor': self.pilot_factor,
448
577
  }
449
578
 
579
+ self._const_min_sample = max(50, self.min_sample // 5)
580
+ self._const_max_depth = self.max_depth + 3
581
+
450
582
  self._validate_params()
451
583
 
452
584
  return self
585
+
586
+
587
+
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: kernelboost
- Version: 0.2.1
+ Version: 0.3.0
  Summary: Gradient boosting with kernel regression base learners
  Author-email: tlaiho <tslaiho@gmail.com>
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: cupy>=11.0.0; extra == "all"
29
29
  ![C](https://img.shields.io/badge/C-language-blue)
30
30
  ![GPU](https://img.shields.io/badge/GPU-CUDA%20C%2FCuPy-orange)
31
31
  ![License](https://img.shields.io/badge/license-MIT-green)
32
- ![Version](https://img.shields.io/badge/version-0.2.0-blue)
32
+ ![Version](https://img.shields.io/badge/version-0.3.0-blue)
33
33
 
34
34
  KernelBoost is a gradient boosting algorithm that uses Nadaraya-Watson (local constant) kernel estimators as base learners instead of decision trees. It has:
35
35
 
@@ -15,6 +15,9 @@ kernelboost/kernels.c
  kernelboost/kernels.cu
  kernelboost/libkernels.dll
  kernelboost/libkernels.so
+ kernelboost/libmi.dll
+ kernelboost/libmi.so
+ kernelboost/mi_bins.c
  kernelboost/multiclassbooster.py
  kernelboost/objectives.py
  kernelboost/optimizer.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "kernelboost"
7
- version = "0.2.1"
7
+ version = "0.3.0"
8
8
  description = "Gradient boosting with kernel regression base learners"
9
9
  readme = "PYPI_README.md"
10
10
  requires-python = ">=3.9"
File without changes
File without changes
File without changes