warpgbm 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.13/warpgbm.egg-info → warpgbm-0.1.15}/PKG-INFO +26 -13
- {warpgbm-0.1.13 → warpgbm-0.1.15}/README.md +25 -12
- {warpgbm-0.1.13 → warpgbm-0.1.15}/pyproject.toml +1 -1
- warpgbm-0.1.15/version.txt +1 -0
- warpgbm-0.1.15/warpgbm/core.py +491 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15/warpgbm.egg-info}/PKG-INFO +26 -13
- warpgbm-0.1.13/version.txt +0 -1
- warpgbm-0.1.13/warpgbm/core.py +0 -241
- {warpgbm-0.1.13 → warpgbm-0.1.15}/LICENSE +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/MANIFEST.in +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/setup.cfg +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/setup.py +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/tests/__init__.py +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/tests/test_fit_predict_corr.py +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm/cuda/best_split_kernel.cu +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm/cuda/histogram_kernel.cu +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm/cuda/node_kernel.cpp +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm.egg-info/SOURCES.txt +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.13 → warpgbm-0.1.15}/warpgbm.egg-info/top_level.txt +0 -0
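To inspect these changes locally instead of relying on this rendered summary, one possible approach is sketched below. It is not part of warpgbm; it uses only pip and the Python standard library, and it assumes pip can fetch and prepare the two sdists (which may require a local PyTorch install, since warpgbm builds CUDA extensions from source).

```python
# Sketch: download and unpack both sdists, then print a recursive file comparison.
# Paths and output format are illustrative, not part of warpgbm itself.
import filecmp
import pathlib
import subprocess
import tarfile

work = pathlib.Path("warpgbm_diff")
work.mkdir(exist_ok=True)

for version in ("0.1.13", "0.1.15"):
    # --no-binary :all: forces the .tar.gz sdist instead of a wheel
    subprocess.run(
        ["pip", "download", f"warpgbm=={version}", "--no-deps",
         "--no-binary", ":all:", "-d", str(work)],
        check=True,
    )
    with tarfile.open(work / f"warpgbm-{version}.tar.gz") as tf:
        tf.extractall(work)

# Recursive report of identical, differing, and added/removed files
filecmp.dircmp(str(work / "warpgbm-0.1.13"), str(work / "warpgbm-0.1.15")).report_full_closure()
```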
{warpgbm-0.1.13/warpgbm.egg-info → warpgbm-0.1.15}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.13
+Version: 0.1.15
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007
@@ -700,7 +700,6 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 - GPU-accelerated training and histogram construction using custom CUDA kernels
 - Drop-in scikit-learn style interface
 - Supports pre-binned data or automatic quantile binning
-- Fully differentiable prediction path
 - Simple install with `pip`
 
 ---
@@ -713,7 +712,7 @@ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), Warp
 
 ## Installation
 
-###
+### Recommended (GitHub, always latest):
 
 ```bash
 pip install git+https://github.com/jefferythewind/warpgbm.git
@@ -721,7 +720,7 @@ pip install git+https://github.com/jefferythewind/warpgbm.git
 
 This installs the latest version directly from GitHub and compiles CUDA extensions on your machine using your **local PyTorch and CUDA setup**. It's the most reliable method for ensuring compatibility and staying up to date with the latest features.
 
-###
+### Alternatively (PyPI, stable releases):
 
 ```bash
 pip install warpgbm
@@ -729,7 +728,7 @@ pip install warpgbm
 
 This installs from PyPI and also compiles CUDA code locally during installation. This method works well **if your environment already has PyTorch with GPU support** installed and configured.
 
->
+> **Tip:**\
 > If you encounter an error related to mismatched or missing CUDA versions, try installing with the following flag:
 >
 > ```bash
@@ -737,7 +736,7 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > ```
 
 Before either method, make sure you’ve installed PyTorch with GPU support:\
-
+[https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
 ---
 
@@ -774,7 +773,7 @@ print(f"LightGBM: corr = {np.corrcoef(lgb_preds, y)[0,1]:.4f}, time = {lgb_tim
 print(f"WarpGBM: corr = {np.corrcoef(wgbm_preds, y)[0,1]:.4f}, time = {wgbm_time:.2f}s")
 ```
 
-
+**Results (Ryzen 9 CPU, NVIDIA 3090 GPU):**
 
 ```
 LightGBM: corr = 0.8742, time = 37.33s
@@ -824,6 +823,23 @@ print(f"LightGBM: corr = {np.corrcoef(lgb_preds, Y_np)[0,1]:.4f}, time = {lgb_
 print(f"WarpGBM: corr = {np.corrcoef(wgbm_preds, Y_np)[0,1]:.4f}, time = {wgbm_time:.2f}s")
 ```
 
+**Results (Google Colab Pro, A100 GPU):**
+
+```
+LightGBM: corr = 0.0703, time = 643.88s
+WarpGBM: corr = 0.0660, time = 49.16s
+```
+
+---
+
+### Run it live in Colab
+
+You can try WarpGBM in a live Colab notebook using real pre-binned Numerai tournament data:
+
+[Open in Colab](https://colab.research.google.com/drive/10mKSjs9UvmMgM5_lOXAylq5LUQAnNSi7?usp=sharing)
+
+No installation required — just press **"Open in Playground"**, then **Run All**!
+
 ---
 
 ## Documentation
@@ -835,18 +851,15 @@ print(f"WarpGBM: corr = {np.corrcoef(wgbm_preds, Y_np)[0,1]:.4f}, time = {wg
 - `n_estimators`: Number of boosting iterations (default: 100)
 - `min_child_weight`: Minimum sum of instance weight needed in a child (default: 20)
 - `min_split_gain`: Minimum loss reduction required to make a further partition (default: 0.0)
-- `verbosity`: Whether to print training logs (default: True)
 - `histogram_computer`: Choice of histogram kernel (`'hist1'`, `'hist2'`, `'hist3'`) (default: `'hist3'`)
 - `threads_per_block`: CUDA threads per block (default: 32)
 - `rows_per_thread`: Number of training rows processed per thread (default: 4)
-- `
-- `split_type`: Algorithm used to choose best split (`'v1'` = CUDA kernel, `'v2'` = torch-based) (default: `'v2'`)
+- `L2_reg`: L2 regularizer (default: 1e-6)
 
 ### Methods:
 - `.fit(X, y, era_id=None)`: Train the model. `X` can be raw floats or pre-binned `int8` data. `era_id` is optional and used internally.
-- `.predict(X)`: Predict on new raw float or pre-binned data.
-- `.
-- `.grow_forest()`: Manually triggers tree construction loop (usually not needed).
+- `.predict(X, chunksize=50_000)`: Predict on new raw float or pre-binned data.
+- `.predict_numpy(X, chunksize=50_000)`: Same as `.predict(X)` but without using the GPU.
 
 ---
 
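The documented parameter list in the hunk above drops `verbosity` and `split_type` and adds `L2_reg`. A minimal sketch of a 0.1.15 constructor call follows; it assumes warpgbm 0.1.15 is installed with a working CUDA build, imports the class from `warpgbm.core` where it is defined, and uses illustrative values that mirror the documented defaults.

```python
# Sketch of a 0.1.15 constructor call; `L2_reg` is newly documented, while
# `verbosity` and `split_type` no longer appear in the documented parameters.
from warpgbm.core import WarpGBM  # the class is defined in warpgbm/core.py

model = WarpGBM(
    num_bins=10,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    min_child_weight=20,
    min_split_gain=0.0,
    histogram_computer='hist3',  # 'hist1', 'hist2', or 'hist3'
    threads_per_block=32,
    rows_per_thread=4,
    L2_reg=1e-6,
)
```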
{warpgbm-0.1.13 → warpgbm-0.1.15}/README.md
@@ -12,7 +12,6 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 - GPU-accelerated training and histogram construction using custom CUDA kernels
 - Drop-in scikit-learn style interface
 - Supports pre-binned data or automatic quantile binning
-- Fully differentiable prediction path
 - Simple install with `pip`
 
 ---
@@ -25,7 +24,7 @@ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), Warp
 
 ## Installation
 
-###
+### Recommended (GitHub, always latest):
 
 ```bash
 pip install git+https://github.com/jefferythewind/warpgbm.git
@@ -33,7 +32,7 @@ pip install git+https://github.com/jefferythewind/warpgbm.git
 
 This installs the latest version directly from GitHub and compiles CUDA extensions on your machine using your **local PyTorch and CUDA setup**. It's the most reliable method for ensuring compatibility and staying up to date with the latest features.
 
-###
+### Alternatively (PyPI, stable releases):
 
 ```bash
 pip install warpgbm
@@ -41,7 +40,7 @@ pip install warpgbm
 
 This installs from PyPI and also compiles CUDA code locally during installation. This method works well **if your environment already has PyTorch with GPU support** installed and configured.
 
->
+> **Tip:**\
 > If you encounter an error related to mismatched or missing CUDA versions, try installing with the following flag:
 >
 > ```bash
@@ -49,7 +48,7 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > ```
 
 Before either method, make sure you’ve installed PyTorch with GPU support:\
-
+[https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
 ---
 
@@ -86,7 +85,7 @@ print(f"LightGBM: corr = {np.corrcoef(lgb_preds, y)[0,1]:.4f}, time = {lgb_tim
 print(f"WarpGBM: corr = {np.corrcoef(wgbm_preds, y)[0,1]:.4f}, time = {wgbm_time:.2f}s")
 ```
 
-
+**Results (Ryzen 9 CPU, NVIDIA 3090 GPU):**
 
 ```
 LightGBM: corr = 0.8742, time = 37.33s
@@ -136,6 +135,23 @@ print(f"LightGBM: corr = {np.corrcoef(lgb_preds, Y_np)[0,1]:.4f}, time = {lgb_
 print(f"WarpGBM: corr = {np.corrcoef(wgbm_preds, Y_np)[0,1]:.4f}, time = {wgbm_time:.2f}s")
 ```
 
+**Results (Google Colab Pro, A100 GPU):**
+
+```
+LightGBM: corr = 0.0703, time = 643.88s
+WarpGBM: corr = 0.0660, time = 49.16s
+```
+
+---
+
+### Run it live in Colab
+
+You can try WarpGBM in a live Colab notebook using real pre-binned Numerai tournament data:
+
+[Open in Colab](https://colab.research.google.com/drive/10mKSjs9UvmMgM5_lOXAylq5LUQAnNSi7?usp=sharing)
+
+No installation required — just press **"Open in Playground"**, then **Run All**!
+
 ---
 
 ## Documentation
@@ -147,18 +163,15 @@ print(f"WarpGBM: corr = {np.corrcoef(wgbm_preds, Y_np)[0,1]:.4f}, time = {wg
 - `n_estimators`: Number of boosting iterations (default: 100)
 - `min_child_weight`: Minimum sum of instance weight needed in a child (default: 20)
 - `min_split_gain`: Minimum loss reduction required to make a further partition (default: 0.0)
-- `verbosity`: Whether to print training logs (default: True)
 - `histogram_computer`: Choice of histogram kernel (`'hist1'`, `'hist2'`, `'hist3'`) (default: `'hist3'`)
 - `threads_per_block`: CUDA threads per block (default: 32)
 - `rows_per_thread`: Number of training rows processed per thread (default: 4)
-- `
-- `split_type`: Algorithm used to choose best split (`'v1'` = CUDA kernel, `'v2'` = torch-based) (default: `'v2'`)
+- `L2_reg`: L2 regularizer (default: 1e-6)
 
 ### Methods:
 - `.fit(X, y, era_id=None)`: Train the model. `X` can be raw floats or pre-binned `int8` data. `era_id` is optional and used internally.
-- `.predict(X)`: Predict on new raw float or pre-binned data.
-- `.
-- `.grow_forest()`: Manually triggers tree construction loop (usually not needed).
+- `.predict(X, chunksize=50_000)`: Predict on new raw float or pre-binned data.
+- `.predict_numpy(X, chunksize=50_000)`: Same as `.predict(X)` but without using the GPU.
 
 ---
 
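The Methods section above replaces the single `.predict(X)` with a chunked GPU `.predict` and a CPU-only `.predict_numpy`. The sketch below exercises that 0.1.15 interface end to end on synthetic pre-binned data; it assumes warpgbm 0.1.15 is installed with a working CUDA build, and note that the released code spells the keyword `chunk_size` even though the README abbreviates it to `chunksize`.

```python
import numpy as np
from warpgbm.core import WarpGBM

# Illustrative synthetic data: pre-binned int8 features whose values stay below num_bins.
rng = np.random.default_rng(0)
X = rng.integers(0, 10, size=(100_000, 50)).astype(np.int8)
y = X[:, 0].astype(np.float32) + rng.normal(0, 1, size=len(X)).astype(np.float32)

model = WarpGBM(num_bins=10, max_depth=3, n_estimators=50, L2_reg=1e-6)
model.fit(X, y)  # era_id is optional

preds_gpu = model.predict(X, chunk_size=50_000)        # chunked, level-wise GPU traversal
preds_cpu = model.predict_numpy(X, chunk_size=50_000)  # same tree layout, NumPy only
print(np.corrcoef(preds_gpu, y)[0, 1])
```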
warpgbm-0.1.15/version.txt (new file)
@@ -0,0 +1 @@
+0.1.15
warpgbm-0.1.15/warpgbm/core.py (new file, 491 lines)
@@ -0,0 +1,491 @@
import torch
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from warpgbm.cuda import node_kernel
from tqdm import tqdm

histogram_kernels = {
    'hist1': node_kernel.compute_histogram,
    'hist2': node_kernel.compute_histogram2,
    'hist3': node_kernel.compute_histogram3
}

class WarpGBM(BaseEstimator, RegressorMixin):
    def __init__(
        self,
        num_bins=10,
        max_depth=3,
        learning_rate=0.1,
        n_estimators=100,
        min_child_weight=20,
        min_split_gain=0.0,
        verbosity=True,
        histogram_computer='hist3',
        threads_per_block=64,
        rows_per_thread=4,
        L2_reg=1e-6,
        device='cuda'
    ):
        self.num_bins = num_bins
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.forest = None
        self.bin_edges = None  # shape: [num_features, num_bins-1] if using quantile binning
        self.base_prediction = None
        self.unique_eras = None
        self.device = device
        self.root_gradient_histogram = None
        self.root_hessian_histogram = None
        self.gradients = None
        self.root_node_indices = None
        self.bin_indices = None
        self.Y_gpu = None
        self.num_features = None
        self.num_samples = None
        self.out_feature = torch.zeros(1, device=self.device, dtype=torch.int32)
        self.out_bin = torch.zeros(1, device=self.device, dtype=torch.int32)
        self.min_child_weight = min_child_weight
        self.min_split_gain = min_split_gain
        self.best_gain = torch.tensor([-float('inf')], dtype=torch.float32, device=self.device)
        self.best_feature = torch.tensor([-1], dtype=torch.int32, device=self.device)
        self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
        self.compute_histogram = histogram_kernels[histogram_computer]
        self.threads_per_block = threads_per_block
        self.rows_per_thread = rows_per_thread
        self.L2_reg = L2_reg

    def fit(self, X, y, era_id=None):
        if era_id is None:
            era_id = np.ones(X.shape[0], dtype='int32')
        self.bin_indices, era_indices, self.bin_edges, self.unique_eras, self.Y_gpu = self.preprocess_gpu_data(X, y, era_id)
        self.num_samples, self.num_features = X.shape
        self.gradients = torch.zeros_like(self.Y_gpu)
        self.root_node_indices = torch.arange(self.num_samples, device=self.device)
        self.base_prediction = self.Y_gpu.mean().item()
        self.gradients += self.base_prediction
        self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
        self.forest = self.grow_forest()
        return self

    def compute_quantile_bins(self, X, num_bins):
        quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]  # exclude 0% and 100%
        bin_edges = torch.quantile(X, quantiles, dim=0)  # shape: [B-1, F]
        return bin_edges.T  # shape: [F, B-1]

    def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
        self.num_samples, self.num_features = X_np.shape
        Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
        era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
        if is_integer_type:
            max_vals = X_np.max(axis=0)
            if np.all(max_vals < self.num_bins):
                print("Detected pre-binned integer input — skipping quantile binning.")
                bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)

                # We'll store None or an empty tensor in self.bin_edges
                # to indicate that we skip binning at predict-time
                bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
                bin_edges = bin_edges.to(self.device)
                unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
                return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
            else:
                print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")

        print("Performing quantile binning on CPU...")
        X_cpu = torch.from_numpy(X_np).type(torch.float32)  # CPU tensor
        bin_edges_cpu = self.compute_quantile_bins(X_cpu, self.num_bins).type(torch.float32).contiguous()
        bin_indices_cpu = torch.empty((self.num_samples, self.num_features), dtype=torch.int8)
        for f in range(self.num_features):
            bin_indices_cpu[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
        bin_indices = bin_indices_cpu.to(self.device).contiguous()
        bin_edges = bin_edges_cpu.to(self.device)
        unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
        return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu

    def compute_histograms(self, bin_indices_sub, gradients):
        grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
        hess_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)

        self.compute_histogram(
            bin_indices_sub,
            gradients,
            grad_hist,
            hess_hist,
            self.num_bins,
            self.threads_per_block,
            self.rows_per_thread
        )
        return grad_hist, hess_hist

    def find_best_split(self, gradient_histogram, hessian_histogram):
        node_kernel.compute_split(
            gradient_histogram.contiguous(),
            hessian_histogram.contiguous(),
            self.num_features,
            self.num_bins,
            self.min_split_gain,
            self.min_child_weight,
            self.L2_reg,
            self.out_feature,
            self.out_bin
        )

        f = int(self.out_feature[0])
        b = int(self.out_bin[0])
        return (f, b)

    def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
        if depth == self.max_depth:
            leaf_value = self.residual[node_indices].mean()
            self.gradients[node_indices] += self.learning_rate * leaf_value
            return {"leaf_value": leaf_value.item(), "samples": node_indices.numel()}

        parent_size = node_indices.numel()
        best_feature, best_bin = self.find_best_split(gradient_histogram, hessian_histogram)

        if best_feature == -1:
            leaf_value = self.residual[node_indices].mean()
            self.gradients[node_indices] += self.learning_rate * leaf_value
            return {"leaf_value": leaf_value.item(), "samples": parent_size}

        split_mask = (self.bin_indices[node_indices, best_feature] <= best_bin)
        left_indices = node_indices[split_mask]
        right_indices = node_indices[~split_mask]

        left_size = left_indices.numel()
        right_size = right_indices.numel()

        if left_size == 0 or right_size == 0:
            leaf_value = self.residual[node_indices].mean()
            self.gradients[node_indices] += self.learning_rate * leaf_value
            return {"leaf_value": leaf_value.item(), "samples": parent_size}

        if left_size <= right_size:
            grad_hist_left, hess_hist_left = self.compute_histograms(self.bin_indices[left_indices], self.residual[left_indices])
            grad_hist_right = gradient_histogram - grad_hist_left
            hess_hist_right = hessian_histogram - hess_hist_left
        else:
            grad_hist_right, hess_hist_right = self.compute_histograms(self.bin_indices[right_indices], self.residual[right_indices])
            grad_hist_left = gradient_histogram - grad_hist_right
            hess_hist_left = hessian_histogram - hess_hist_right

        new_depth = depth + 1
        left_child = self.grow_tree(grad_hist_left, hess_hist_left, left_indices, new_depth)
        right_child = self.grow_tree(grad_hist_right, hess_hist_right, right_indices, new_depth)

        return {"feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child}

    def grow_forest(self):
        forest = [{} for _ in range(self.n_estimators)]
        self.training_loss = []

        for i in range(self.n_estimators):
            self.residual = self.Y_gpu - self.gradients

            self.root_gradient_histogram, self.root_hessian_histogram = \
                self.compute_histograms(self.bin_indices, self.residual)

            tree = self.grow_tree(
                self.root_gradient_histogram,
                self.root_hessian_histogram,
                self.root_node_indices,
                depth=0
            )
            forest[i] = tree
            loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
            self.training_loss.append(loss)
            # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")

        print("Finished training forest.")
        return forest

    def predict(self, X_np, chunk_size=50000):
        """
        Vectorized predict using a padded layer-by-layer approach.
        We assume `flatten_forest_to_tensors` has produced self.flat_forest with
        "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
        """
        # 1) Convert X_np -> bin_indices
        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
        if is_integer_type:
            max_vals = X_np.max(axis=0)
            if np.all(max_vals < self.num_bins):
                bin_indices = X_np.astype(np.int8)
            else:
                raise ValueError("Pre-binned integers must be < num_bins")
        else:
            X_cpu = torch.from_numpy(X_np).type(torch.float32)
            bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
            bin_edges_cpu = self.bin_edges.to('cpu')
            for f in range(self.num_features):
                bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
            bin_indices = bin_indices.numpy()

        # 2) Ensure we have a padded representation
        self.flat_forest = self.flatten_forest_to_tensors(self.forest)

        features_t = self.flat_forest["features"]      # [n_trees, max_nodes], int16
        thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
        values_t = self.flat_forest["leaf_values"]     # [n_trees, max_nodes], float32
        max_nodes = self.flat_forest["max_nodes"]

        n_trees = features_t.shape[0]
        N = bin_indices.shape[0]
        out = np.zeros(N, dtype=np.float32)

        # 3) Process rows in chunks
        for start in tqdm(range(0, N, chunk_size)):
            end = min(start + chunk_size, N)
            chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
            chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8

            # Accumulate raw (unscaled) leaf sums
            chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)

            # node_idx[i] tracks the current node index in the padded tree for row i
            node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)

            # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
            active = torch.ones((end - start,), dtype=torch.bool, device=self.device)

            for t in range(n_trees):
                # Reset for each tree (each tree is independent)
                node_idx.fill_(0)
                active.fill_(True)

                tree_features = features_t[t]  # shape [max_nodes], int16
                tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
                tree_values = values_t[t]      # shape [max_nodes], float32

                # Up to self.max_depth+1 layers
                for _level in range(self.max_depth + 1):
                    active_idx = active.nonzero(as_tuple=True)[0]
                    if active_idx.numel() == 0:
                        break  # all rows are done in this tree

                    current_node_idx = node_idx[active_idx]
                    f = tree_features[current_node_idx]   # shape [#active], int16
                    thr = tree_thresh[current_node_idx]   # shape [#active], int16
                    vals = tree_values[current_node_idx]  # shape [#active], float32

                    mask_no_node = (f == -2)
                    mask_leaf = (f == -1)

                    # If leaf, add leaf value and mark inactive.
                    if mask_leaf.any():
                        leaf_rows = active_idx[mask_leaf]
                        chunk_preds[leaf_rows] += vals[mask_leaf]
                        active[leaf_rows] = False

                    # If no node, mark inactive.
                    if mask_no_node.any():
                        no_node_rows = active_idx[mask_no_node]
                        active[no_node_rows] = False

                    # For internal nodes, perform bin comparison.
                    mask_internal = (~mask_leaf & ~mask_no_node)
                    if mask_internal.any():
                        internal_rows = active_idx[mask_internal]
                        act_f = f[mask_internal].long()
                        act_thr = thr[mask_internal]
                        binvals = chunk_gpu[internal_rows, act_f]
                        go_left = (binvals <= act_thr)
                        new_left_idx = current_node_idx[mask_internal] * 2 + 1
                        new_right_idx = current_node_idx[mask_internal] * 2 + 2
                        node_idx[internal_rows[go_left]] = new_left_idx[go_left]
                        node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
                # end per-tree layer loop
            # end for each tree

            out[start:end] = (
                self.base_prediction + self.learning_rate * chunk_preds
            ).cpu().numpy()

        return out

    def flatten_forest_to_tensors(self, forest):
        """
        Convert a list of dict-based trees into a fixed-size array representation
        for each tree, up to max_depth. Each tree is stored in a 'perfect binary tree'
        layout:
          - node 0 is the root
          - node i has children (2*i + 1) and (2*i + 2), if within range
          - feature = -2 indicates no node / invalid
          - feature = -1 indicates a leaf node
          - otherwise, an internal node with that feature.
        """
        n_trees = len(forest)
        max_nodes = 2 ** (self.max_depth + 1) - 1  # total array slots per tree

        # Allocate padded arrays (on CPU for ease of indexing).
        feat_arr = np.full((n_trees, max_nodes), -2, dtype=np.int16)
        thresh_arr = np.full((n_trees, max_nodes), -2, dtype=np.int16)
        value_arr = np.zeros((n_trees, max_nodes), dtype=np.float32)

        def fill_padded(tree, tree_idx, node_idx, depth):
            """
            Recursively fill feat_arr, thresh_arr, value_arr for a single tree.
            If depth == self.max_depth, no children are added.
            If there's no node, feature remains -2.
            """
            if "leaf_value" in tree:
                feat_arr[tree_idx, node_idx] = -1
                thresh_arr[tree_idx, node_idx] = -1
                value_arr[tree_idx, node_idx] = tree["leaf_value"]
                return

            feat = tree["feature"]
            bin_th = tree["bin"]

            feat_arr[tree_idx, node_idx] = feat
            thresh_arr[tree_idx, node_idx] = bin_th
            # Internal nodes keep a 0 value.

            if depth < self.max_depth:
                left_idx = 2 * node_idx + 1
                right_idx = 2 * node_idx + 2
                fill_padded(tree["left"], tree_idx, left_idx, depth + 1)
                fill_padded(tree["right"], tree_idx, right_idx, depth + 1)
            # At max depth, children remain unfilled (-2).

        for t, root in enumerate(forest):
            fill_padded(root, t, 0, 0)

        # Convert to torch Tensors on the proper device.
        features_t = torch.from_numpy(feat_arr).to(self.device)
        thresholds_t = torch.from_numpy(thresh_arr).to(self.device)
        leaf_values_t = torch.from_numpy(value_arr).to(self.device)

        return {
            "features": features_t,        # [n_trees, max_nodes]
            "thresholds": thresholds_t,    # [n_trees, max_nodes]
            "leaf_values": leaf_values_t,  # [n_trees, max_nodes]
            "max_nodes": max_nodes
        }

    def predict_numpy(self, X_np, chunk_size=50000):
        """
        Fully NumPy-based version of predict_fast.
        Assumes flatten_forest_to_tensors has been called and `self.flat_forest` is ready.
        """
        # 1) Convert X_np -> bin_indices
        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
        if is_integer_type:
            max_vals = X_np.max(axis=0)
            if np.all(max_vals < self.num_bins):
                bin_indices = X_np.astype(np.int8)
            else:
                raise ValueError("Pre-binned integers must be < num_bins")
        else:
            bin_indices = np.empty_like(X_np, dtype=np.int8)
            # Ensure bin_edges are NumPy arrays
            if isinstance(self.bin_edges[0], torch.Tensor):
                bin_edges_np = [be.cpu().numpy() for be in self.bin_edges]
            else:
                bin_edges_np = self.bin_edges

            for f in range(self.num_features):
                bin_indices[:, f] = np.searchsorted(bin_edges_np[f], X_np[:, f], side='left')

        # Ensure we have a padded representation
        self.flat_forest = self.flatten_forest(self.forest)

        # 2) Padded forest arrays (already NumPy now)
        features_t = self.flat_forest["features"]      # [n_trees, max_nodes], int16
        thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
        values_t = self.flat_forest["leaf_values"]     # [n_trees, max_nodes], float32
        max_nodes = self.flat_forest["max_nodes"]
        n_trees = features_t.shape[0]
        N = bin_indices.shape[0]
        out = np.zeros(N, dtype=np.float32)

        # 3) Process in chunks
        for start in tqdm(range(0, N, chunk_size)):
            end = min(start + chunk_size, N)
            chunk = bin_indices[start:end]  # [chunk_size, F]
            chunk_preds = np.zeros(end - start, dtype=np.float32)

            for t in range(n_trees):
                node_idx = np.zeros(end - start, dtype=np.int32)
                active = np.ones(end - start, dtype=bool)

                tree_features = features_t[t]  # [max_nodes]
                tree_thresh = thresholds_t[t]  # [max_nodes]
                tree_values = values_t[t]      # [max_nodes]

                for _level in range(self.max_depth + 1):
                    active_idx = np.nonzero(active)[0]
                    if active_idx.size == 0:
                        break

                    current_node_idx = node_idx[active_idx]
                    f = tree_features[current_node_idx]
                    thr = tree_thresh[current_node_idx]
                    vals = tree_values[current_node_idx]

                    mask_no_node = (f == -2)
                    mask_leaf = (f == -1)
                    mask_internal = ~(mask_leaf | mask_no_node)

                    if np.any(mask_leaf):
                        leaf_rows = active_idx[mask_leaf]
                        chunk_preds[leaf_rows] += vals[mask_leaf]
                        active[leaf_rows] = False

                    if np.any(mask_no_node):
                        no_node_rows = active_idx[mask_no_node]
                        active[no_node_rows] = False

                    if np.any(mask_internal):
                        internal_rows = active_idx[mask_internal]
                        act_f = f[mask_internal].astype(np.int32)
                        act_thr = thr[mask_internal]
                        binvals = chunk[internal_rows, act_f]
                        go_left = binvals <= act_thr

                        new_left_idx = current_node_idx[mask_internal] * 2 + 1
                        new_right_idx = current_node_idx[mask_internal] * 2 + 2
                        node_idx[internal_rows[go_left]] = new_left_idx[go_left]
                        node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]

            out[start:end] = self.base_prediction + self.learning_rate * chunk_preds

        return out

    def flatten_forest(self, forest):
        n_trees = len(forest)
        max_nodes = 2 ** (self.max_depth + 1) - 1

        feat_arr = np.full((n_trees, max_nodes), -2, dtype=np.int16)
        thresh_arr = np.full((n_trees, max_nodes), -2, dtype=np.int16)
        value_arr = np.zeros((n_trees, max_nodes), dtype=np.float32)

        def fill_padded(tree, tree_idx, node_idx, depth):
            if "leaf_value" in tree:
                feat_arr[tree_idx, node_idx] = -1
                thresh_arr[tree_idx, node_idx] = -1
                value_arr[tree_idx, node_idx] = tree["leaf_value"]
                return
            feat = tree["feature"]
            bin_th = tree["bin"]
            feat_arr[tree_idx, node_idx] = feat
            thresh_arr[tree_idx, node_idx] = bin_th

            if depth < self.max_depth:
                left_idx = 2 * node_idx + 1
                right_idx = 2 * node_idx + 2
                fill_padded(tree["left"], tree_idx, left_idx, depth + 1)
                fill_padded(tree["right"], tree_idx, right_idx, depth + 1)

        for t, root in enumerate(forest):
            fill_padded(root, t, 0, 0)

        return {
            "features": feat_arr,
            "thresholds": thresh_arr,
            "leaf_values": value_arr,
            "max_nodes": max_nodes
        }
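The `predict` in the new core.py above flattens each tree into a padded perfect-binary-tree array (children of node `i` live at `2*i + 1` and `2*i + 2`, feature `-1` marks a leaf, `-2` marks an absent slot) and advances every row one level at a time. The following self-contained NumPy sketch is an illustration of that traversal on a single hand-built depth-2 tree, not warpgbm code.

```python
import numpy as np

# One depth-2 tree in the padded layout: node 0 splits on feature 0 at bin <= 4,
# its left child (node 1) is a leaf, its right child (node 2) splits on feature 1
# at bin <= 7 into leaves at nodes 5 and 6; nodes 3 and 4 are absent (-2).
features   = np.array([0, -1, 1, -2, -2, -1, -1], dtype=np.int16)
thresholds = np.array([4, -1, 7, -2, -2, -1, -1], dtype=np.int16)
leaf_vals  = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 3.0], dtype=np.float32)

X_binned = np.array([[2, 0], [9, 5], [9, 9]], dtype=np.int8)  # three pre-binned rows
node = np.zeros(len(X_binned), dtype=np.int32)    # current node per row
active = np.ones(len(X_binned), dtype=bool)       # rows that have not reached a leaf
out = np.zeros(len(X_binned), dtype=np.float32)   # accumulated leaf values

max_depth = 2
for _level in range(max_depth + 1):
    idx = np.nonzero(active)[0]
    if idx.size == 0:
        break
    f = features[node[idx]]
    is_leaf = f == -1
    out[idx[is_leaf]] += leaf_vals[node[idx][is_leaf]]   # collect leaf values
    active[idx[is_leaf]] = False
    absent = f == -2
    active[idx[absent]] = False                          # dead slots stop too
    internal = ~is_leaf & ~absent
    rows = idx[internal]
    go_left = X_binned[rows, f[internal]] <= thresholds[node[rows]]
    node[rows] = np.where(go_left, 2 * node[rows] + 1, 2 * node[rows] + 2)

print(out)  # [1. 2. 3.]
```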
{warpgbm-0.1.13 → warpgbm-0.1.15/warpgbm.egg-info}/PKG-INFO
Same changes as the PKG-INFO diff above: the version bump to 0.1.15 plus the identical README updates embedded in the package metadata.
warpgbm-0.1.13/version.txt DELETED
@@ -1 +0,0 @@
-0.1.13
warpgbm-0.1.13/warpgbm/core.py DELETED
@@ -1,241 +0,0 @@
Lines 1-204 of the deleted file (imports, `histogram_kernels`, the `WarpGBM` constructor, `fit`, binning, histogram, split-finding, and tree/forest growing) are identical to lines 1-204 of the new warpgbm-0.1.15/warpgbm/core.py shown above. The removed remainder is the old row-by-row prediction path:

    def predict(self, X_np, era_id_np=None):
        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
        if is_integer_type:
            max_vals = X_np.max(axis=0)
            if np.all(max_vals < self.num_bins):
                bin_indices = X_np.astype(np.int8)
                return self.predict_data(bin_indices)

        X_cpu = torch.from_numpy(X_np).type(torch.float32)  # CPU tensor
        bin_indices_cpu = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
        bin_edges_cpu = self.bin_edges.to('cpu')
        for f in range(self.num_features):
            bin_indices_cpu[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)

        bin_indices = bin_indices_cpu.numpy()  # Use CPU numpy array for predict_data
        return self.predict_data(bin_indices)

    @staticmethod
    def process_node(node, data_idx, bin_indices):
        while 'leaf_value' not in node:
            if bin_indices[data_idx, node['feature']] <= node['bin']:
                node = node['left']
            else:
                node = node['right']
        return node['leaf_value']

    def predict_data(self, bin_indices):
        n = bin_indices.shape[0]
        preds = np.zeros(n)
        proc = self.process_node  # local var for speed
        lr = self.learning_rate
        base = self.base_prediction
        forest = self.forest

        for i in tqdm(range(n)):
            preds[i] = base + lr * np.sum([proc(tree, i, bin_indices) for tree in forest])
        return preds
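Since the per-row `predict_data` loop above is what 0.1.15 replaces with the chunked, level-wise traversals, a quick way to compare the two released prediction paths on your own data is to time `predict` against `predict_numpy`. A minimal sketch, reusing the fitted `model` and `X` from the usage example earlier in this diff:

```python
import time

# Assumes `model` (a fitted WarpGBM) and `X` from the earlier usage sketch.
for name, fn in [("predict (GPU, chunked)", model.predict),
                 ("predict_numpy (CPU)", model.predict_numpy)]:
    t0 = time.time()
    fn(X, chunk_size=50_000)
    print(f"{name}: {time.time() - t0:.2f}s")
```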
Files without changes: the remaining 15 files listed above with +0 -0 (LICENSE, MANIFEST.in, setup.cfg, setup.py, the tests, the CUDA kernel sources, and the egg-info metadata) are identical in both versions.