warpgbm 0.1.15__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.15/warpgbm.egg-info → warpgbm-0.1.17}/PKG-INFO +12 -1
- {warpgbm-0.1.15 → warpgbm-0.1.17}/README.md +11 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/pyproject.toml +1 -1
- {warpgbm-0.1.15 → warpgbm-0.1.17}/setup.py +1 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/tests/test_fit_predict_corr.py +4 -6
- warpgbm-0.1.17/version.txt +1 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/core.py +191 -160
- warpgbm-0.1.17/warpgbm/cuda/binner.cu +52 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/cuda/node_kernel.cpp +6 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17/warpgbm.egg-info}/PKG-INFO +12 -1
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm.egg-info/SOURCES.txt +1 -0
- warpgbm-0.1.15/version.txt +0 -1
- {warpgbm-0.1.15 → warpgbm-0.1.17}/LICENSE +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/MANIFEST.in +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/setup.cfg +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/tests/__init__.py +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/cuda/best_split_kernel.cu +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/cuda/histogram_kernel.cu +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm.egg-info/top_level.txt +0 -0
{warpgbm-0.1.15/warpgbm.egg-info → warpgbm-0.1.17}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.15
+Version: 0.1.17
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -735,6 +735,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
{warpgbm-0.1.15 → warpgbm-0.1.17}/README.md
@@ -47,6 +47,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
{warpgbm-0.1.15 → warpgbm-0.1.17}/tests/test_fit_predict_corr.py
@@ -1,14 +1,12 @@
 import numpy as np
 from warpgbm import WarpGBM
+from sklearn.datasets import make_regression
 
 def test_fit_predict_correlation():
     np.random.seed(42)
-    N =
-    F =
-    X =
-    true_weights = np.array([0.5, -1.0, 2.0, 0.0, 1.0])
-    noise = 0.1 * np.random.randn(N)
-    y = (X @ true_weights + noise).astype(np.float32)
+    N = 1_000_000
+    F = 100
+    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
     era = np.zeros(N, dtype=np.int32)
     corrs = []
warpgbm-0.1.17/version.txt
@@ -0,0 +1 @@
+0.1.17
{warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/core.py
@@ -3,6 +3,8 @@ import numpy as np
 from sklearn.base import BaseEstimator, RegressorMixin
 from warpgbm.cuda import node_kernel
 from tqdm import tqdm
+from typing import Tuple
+from torch import Tensor
 
 histogram_kernels = {
     'hist1': node_kernel.compute_histogram,
@@ -10,6 +12,36 @@ histogram_kernels = {
     'hist3': node_kernel.compute_histogram3
 }
 
+@torch.jit.script
+def jit_find_best_split(
+    G: Tensor, H: Tensor,
+    lambda_l2: float,
+    lambda_l1: float,  # unused placeholder for now
+    min_split_gain: float,
+    min_child_weight: float
+) -> Tuple[int, int]:
+    F, B = G.size()
+    Bm1 = B - 1
+
+    GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
+    GL, HL = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
+    GP, HP = GH[0, :, -1:], GH[1, :, -1:]  # [F, 1]
+    GR = GP - GL
+    HR = HP - HL
+
+    # Validity mask using raw child hessians
+    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
+    g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
+    gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
+
+    gain_flat = gain.view(-1)
+    best_idx = torch.argmax(gain_flat)
+
+    if gain_flat[best_idx].item() == float('-inf'):
+        return -1, -1
+
+    return best_idx // Bm1, best_idx % Bm1
+
 class WarpGBM(BaseEstimator, RegressorMixin):
     def __init__(
         self,
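For intuition, the quantity the new splitter maximizes is the standard second-order split gain, GL^2/(HL + lambda_l2) + GR^2/(HR + lambda_l2) - GP^2/(HP + lambda_l2), evaluated for every (feature, bin) prefix of the cumulative histograms. The snippet below is a self-contained sketch of that computation on a toy histogram; it mirrors the tensor algebra in `jit_find_best_split` but is an illustration only, not the library's code, and the numbers are made up.

```
import torch

# Toy per-(feature, bin) gradient/hessian sums: 2 features x 4 bins.
G = torch.tensor([[2.0, -1.0,  3.0, 0.5],
                  [0.1,  0.2, -0.3, 0.0]])
H = torch.tensor([[4.0,  3.0,  5.0, 2.0],
                  [3.0,  4.0,  3.0, 4.0]])
lambda_l2 = 1e-6

GH = torch.stack([G, H], dim=0).cumsum(dim=2)   # prefix sums over bins
GL, HL = GH[0, :, :-1], GH[1, :, :-1]           # left-child sums per candidate threshold
GP, HP = GH[0, :, -1:], GH[1, :, -1:]           # parent totals
GR, HR = GP - GL, HP - HL                       # right-child sums

gain = GR**2 / (HR + lambda_l2) + GL**2 / (HL + lambda_l2) - GP**2 / (HP + lambda_l2)
best = torch.argmax(gain.view(-1))
print(divmod(int(best), gain.size(1)))          # (feature index, bin threshold)
```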
@@ -24,6 +56,7 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         threads_per_block=64,
         rows_per_thread=4,
         L2_reg = 1e-6,
+        L1_reg = 0.0,
         device = 'cuda'
     ):
         self.num_bins = num_bins
@@ -54,7 +87,7 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.threads_per_block = threads_per_block
         self.rows_per_thread = rows_per_thread
         self.L2_reg = L2_reg
-        [line 57 of the previous version is elided in this diff view]
+        self.L1_reg = L1_reg
 
     def fit(self, X, y, era_id=None):
         if era_id is None:
@@ -68,42 +101,44 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
         self.forest = self.grow_forest()
         return self
-
-    def compute_quantile_bins(self, X, num_bins):
-        quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]  # exclude 0% and 100%
-        bin_edges = torch.quantile(X, quantiles, dim=0)  # shape: [B-1, F]
-        return bin_edges.T  # shape: [F, B-1]
 
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
-        [lines 78-106 of the previous implementation are elided in this diff view]
+        with torch.no_grad():
+            self.num_samples, self.num_features = X_np.shape
+            Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
+            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    print("Detected pre-binned integer input — skipping quantile binning.")
+                    bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+
+                    # We'll store None or an empty tensor in self.bin_edges
+                    # to indicate that we skip binning at predict-time
+                    bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
+                    bin_edges = bin_edges.to(self.device)
+                    unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+                    return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+                else:
+                    print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
+
+            bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
+            bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
+
+            X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
+
+            for f in range(self.num_features):
+                X_f = X_np[:, f].to('cuda', non_blocking=True)
+                quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
+                bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous()  # shape: [B-1] for 1D input
+                bin_indices_f = bin_indices[:, f].contiguous()  # view into output
+                node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
+                bin_indices[:,f] = bin_indices_f
+                bin_edges[f,:] = bin_edges_f
+
+            unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+            return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
 
     def compute_histograms(self, bin_indices_sub, gradients):
         grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
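One practical consequence of the new fast path: an integer matrix whose values are all below `num_bins` is treated as pre-binned and skips quantile binning entirely. The following is a hypothetical end-to-end call showing that branch; it is an illustration only, not from the diff, and assumes a CUDA device plus the `num_bins` constructor argument and `predict` method visible elsewhere in this file.

```
import numpy as np
from warpgbm import WarpGBM

rng = np.random.default_rng(0)
X_binned = rng.integers(0, 10, size=(1_000, 20)).astype(np.int8)  # all values < num_bins
y = rng.normal(size=1_000).astype(np.float32)

model = WarpGBM(num_bins=10)   # bins 0..9 already present in X_binned
model.fit(X_binned, y)         # hits the "pre-binned integer input" branch above
preds = model.predict(X_binned)
```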
@@ -121,20 +156,14 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         return grad_hist, hess_hist
 
     def find_best_split(self, gradient_histogram, hessian_histogram):
-        [line 124 of the previous implementation is elided in this diff view]
-            gradient_histogram
-            hessian_histogram
-            self.
-            self.
+        f,b = jit_find_best_split(
+            gradient_histogram,
+            hessian_histogram,
+            self.L2_reg,
+            self.L1_reg,
             self.min_split_gain,
             self.min_child_weight,
-            self.L2_reg,
-            self.out_feature,
-            self.out_bin
         )
-
-        f = int(self.out_feature[0])
-        b = int(self.out_bin[0])
         return (f, b)
 
     def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
@@ -179,24 +208,25 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
 
     def grow_forest(self):
-        [lines 182-199 of the previous implementation are elided in this diff view]
+        with torch.no_grad():
+            forest = [{} for _ in range(self.n_estimators)]
+            self.training_loss = []
+
+            for i in tqdm( range(self.n_estimators) ):
+                self.residual = self.Y_gpu - self.gradients
+
+                self.root_gradient_histogram, self.root_hessian_histogram = \
+                    self.compute_histograms(self.bin_indices, self.residual)
+
+                tree = self.grow_tree(
+                    self.root_gradient_histogram,
+                    self.root_hessian_histogram,
+                    self.root_node_indices,
+                    depth=0
+                )
+                forest[i] = tree
+                # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
+                # self.training_loss.append(loss)
                 # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
 
         print("Finished training forest.")
@@ -208,103 +238,104 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         We assume `flatten_forest_to_tensors` has produced self.flat_forest with
         "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
         """
-        [lines 211-216 of the previous implementation are elided in this diff view]
+        with torch.no_grad():
+            # 1) Convert X_np -> bin_indices
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    bin_indices = X_np.astype(np.int8)
+                else:
+                    raise ValueError("Pre-binned integers must be < num_bins")
             else:
-                [lines 218-306 of the previous implementation are elided in this diff view]
-                return out
+                X_cpu = torch.from_numpy(X_np).type(torch.float32)
+                bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
+                bin_edges_cpu = self.bin_edges.to('cpu')
+                for f in range(self.num_features):
+                    bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
+                bin_indices = bin_indices.numpy()
+
+            # 2) Ensure we have a padded representation
+            self.flat_forest = self.flatten_forest_to_tensors(self.forest)
+
+            features_t = self.flat_forest["features"]      # [n_trees, max_nodes], int16
+            thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
+            values_t = self.flat_forest["leaf_values"]     # [n_trees, max_nodes], float32
+            max_nodes = self.flat_forest["max_nodes"]
+
+            n_trees = features_t.shape[0]
+            N = bin_indices.shape[0]
+            out = np.zeros(N, dtype=np.float32)
+
+            # 3) Process rows in chunks
+            for start in tqdm(range(0, N, chunk_size)):
+                end = min(start + chunk_size, N)
+                chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
+                chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
+
+                # Accumulate raw (unscaled) leaf sums
+                chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
+
+                # node_idx[i] tracks the current node index in the padded tree for row i
+                node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
+
+                # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
+                active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
+
+                for t in range(n_trees):
+                    # Reset for each tree (each tree is independent)
+                    node_idx.fill_(0)
+                    active.fill_(True)
+
+                    tree_features = features_t[t]  # shape [max_nodes], int16
+                    tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
+                    tree_values = values_t[t]      # shape [max_nodes], float32
+
+                    # Up to self.max_depth+1 layers
+                    for _level in range(self.max_depth + 1):
+                        active_idx = active.nonzero(as_tuple=True)[0]
+                        if active_idx.numel() == 0:
+                            break  # all rows are done in this tree
+
+                        current_node_idx = node_idx[active_idx]
+                        f = tree_features[current_node_idx]   # shape [#active], int16
+                        thr = tree_thresh[current_node_idx]   # shape [#active], int16
+                        vals = tree_values[current_node_idx]  # shape [#active], float32
+
+                        mask_no_node = (f == -2)
+                        mask_leaf = (f == -1)
+
+                        # If leaf, add leaf value and mark inactive.
+                        if mask_leaf.any():
+                            leaf_rows = active_idx[mask_leaf]
+                            chunk_preds[leaf_rows] += vals[mask_leaf]
+                            active[leaf_rows] = False
+
+                        # If no node, mark inactive.
+                        if mask_no_node.any():
+                            no_node_rows = active_idx[mask_no_node]
+                            active[no_node_rows] = False
+
+                        # For internal nodes, perform bin comparison.
+                        mask_internal = (~mask_leaf & ~mask_no_node)
+                        if mask_internal.any():
+                            internal_rows = active_idx[mask_internal]
+                            act_f = f[mask_internal].long()
+                            act_thr = thr[mask_internal]
+                            binvals = chunk_gpu[internal_rows, act_f]
+                            go_left = (binvals <= act_thr)
+                            new_left_idx = current_node_idx[mask_internal] * 2 + 1
+                            new_right_idx = current_node_idx[mask_internal] * 2 + 2
+                            node_idx[internal_rows[go_left]] = new_left_idx[go_left]
+                            node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
+                    # end per-tree layer loop
+                # end for each tree
+
+                out[start:end] = (
+                    self.base_prediction + self.learning_rate * chunk_preds
+                ).cpu().numpy()
+
+            return out
 
     def flatten_forest_to_tensors(self, forest):
         """
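The chunked traversal above relies on the padded layout produced by `flatten_forest_to_tensors`: node `i`'s children live at `2*i + 1` and `2*i + 2`, a feature value of `-1` marks a leaf, and `-2` marks a missing node. Below is a minimal CPU-only sketch of the same walk for a single row, using a toy tree and illustrative names rather than the library's code.

```
import numpy as np

# Toy padded tree: root splits on feature 0 at bin <= 2; both children are leaves.
features   = np.array([0, -1, -1], dtype=np.int16)   # -1 = leaf, -2 = no node
thresholds = np.array([2,  0,  0], dtype=np.int16)
leaf_vals  = np.array([0.0, -0.5, 0.7], dtype=np.float32)

def predict_one(row_bins):
    node = 0
    while features[node] != -1:                       # walk until a leaf is reached
        go_left = row_bins[features[node]] <= thresholds[node]
        node = 2 * node + 1 if go_left else 2 * node + 2
    return leaf_vals[node]

print(predict_one(np.array([1], dtype=np.int8)))   # bin 1 <= 2, goes left  -> -0.5
print(predict_one(np.array([5], dtype=np.int8)))   # bin 5 >  2, goes right ->  0.7
```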
warpgbm-0.1.17/warpgbm/cuda/binner.cu
@@ -0,0 +1,52 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void bin_column_kernel(
+    const float *__restrict__ X,         // [N]
+    const float *__restrict__ bin_edges, // [B - 1]
+    int8_t *__restrict__ bin_indices,    // [N]
+    int N,
+    int B_minus1)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
+        return;
+
+    float val = X[idx];
+    int bin = 0;
+
+    // Linear scan over edges: bin_edges is sorted
+    while (bin < B_minus1 && val >= bin_edges[bin])
+    {
+        ++bin;
+    }
+
+    bin_indices[idx] = static_cast<int8_t>(bin);
+}
+
+// C++ launcher for calling from Python
+void launch_bin_column_kernel(
+    at::Tensor X,          // [N]
+    at::Tensor bin_edges,  // [B - 1]
+    at::Tensor bin_indices // [N]
+)
+{
+    const int N = X.size(0);
+    const int B = bin_edges.size(0);
+
+    const int threads = 256;
+    const int blocks = (N + threads - 1) / threads;
+
+    bin_column_kernel<<<blocks, threads>>>(
+        X.data_ptr<float>(),
+        bin_edges.data_ptr<float>(),
+        bin_indices.data_ptr<int8_t>(),
+        N,
+        B);
+
+    // Optional: sync and error check
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("CUDA error: %s\n", cudaGetErrorString(err));
+}
{warpgbm-0.1.15 → warpgbm-0.1.17}/warpgbm/cuda/node_kernel.cpp
@@ -40,6 +40,11 @@ void launch_histogram_kernel_cuda_configurable(
     int threads_per_block = 256,
     int rows_per_thread = 1);
 
+void launch_bin_column_kernel(
+    at::Tensor X,
+    at::Tensor bin_edges,
+    at::Tensor bin_indices);
+
 // Bindings
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -47,4 +52,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
     m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
     m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
+    m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
 }
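With this binding registered, `core.py` can call the binner as `node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)`. A hypothetical standalone call is sketched below; it is an illustration only and assumes a CUDA-capable build of the extension.

```
import torch
from warpgbm.cuda import node_kernel

x     = torch.randn(1_000, device='cuda')                    # one float32 feature column
edges = torch.tensor([-0.5, 0.0, 0.5], device='cuda')        # num_bins - 1 sorted edges
bins  = torch.empty(1_000, dtype=torch.int8, device='cuda')  # output buffer, filled in place

node_kernel.custom_cuda_binner(x, edges, bins)
print(bins[:10])
```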
{warpgbm-0.1.15 → warpgbm-0.1.17/warpgbm.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.15
+Version: 0.1.17
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -735,6 +735,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
warpgbm-0.1.15/version.txt
DELETED
@@ -1 +0,0 @@
-0.1.15