warpgbm 0.1.16__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.16/warpgbm.egg-info → warpgbm-0.1.18}/PKG-INFO +2 -2
- {warpgbm-0.1.16 → warpgbm-0.1.18}/README.md +1 -1
- {warpgbm-0.1.16 → warpgbm-0.1.18}/pyproject.toml +1 -1
- {warpgbm-0.1.16 → warpgbm-0.1.18}/setup.py +1 -0
- warpgbm-0.1.18/tests/test_fit_predict_corr.py +46 -0
- warpgbm-0.1.18/version.txt +1 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/core.py +154 -178
- warpgbm-0.1.18/warpgbm/cuda/best_split_kernel.cu +79 -0
- warpgbm-0.1.18/warpgbm/cuda/binner.cu +52 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/node_kernel.cpp +11 -6
- {warpgbm-0.1.16 → warpgbm-0.1.18/warpgbm.egg-info}/PKG-INFO +2 -2
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/SOURCES.txt +1 -0
- warpgbm-0.1.16/tests/test_fit_predict_corr.py +0 -66
- warpgbm-0.1.16/version.txt +0 -1
- warpgbm-0.1.16/warpgbm/cuda/best_split_kernel.cu +0 -112
- {warpgbm-0.1.16 → warpgbm-0.1.18}/LICENSE +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/MANIFEST.in +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/setup.cfg +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/tests/__init__.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/histogram_kernel.cu +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/top_level.txt +0 -0
{warpgbm-0.1.16/warpgbm.egg-info → warpgbm-0.1.18}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.16
+Version: 0.1.18
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
                            Version 3, 29 June 2007
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 
 ## Performance Note
 
-In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training** than LightGBM's CPU version and is **2x faster** than LightGBM's GPU version, using default configurations. It also outperforms XGBoost and CatBoost on regression problems, while consuming **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
 
 ---
 
{warpgbm-0.1.16 → warpgbm-0.1.18}/README.md
@@ -18,7 +18,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 
 ## Performance Note
 
-In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training** than LightGBM's CPU version and is **2x faster** than LightGBM's GPU version, using default configurations. It also outperforms XGBoost and CatBoost on regression problems, while consuming **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
 
 ---
 
warpgbm-0.1.18/tests/test_fit_predict_corr.py NEW
@@ -0,0 +1,46 @@
+import numpy as np
+from warpgbm import WarpGBM
+from sklearn.datasets import make_regression
+
+import numpy as np
+import time
+from warpgbm import WarpGBM
+from sklearn.datasets import make_regression
+
+def test_fit_predict_correlation():
+    np.random.seed(42)
+    N = 100_000
+    F = 1000
+    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
+    era = np.zeros(N, dtype=np.int32)
+    corrs = []
+
+    for hist_type in ['hist1', 'hist2', 'hist3']:
+        print(f"\nTesting histogram method: {hist_type}")
+
+        model = WarpGBM(
+            max_depth=10,
+            num_bins=10,
+            n_estimators=10,
+            learning_rate=1,
+            verbosity=False,
+            histogram_computer=hist_type,
+            threads_per_block=128,
+            rows_per_thread=4
+        )
+
+        start_fit = time.time()
+        model.fit(X, y, era_id=era)
+        fit_time = time.time() - start_fit
+        print(f"  Fit time: {fit_time:.3f} seconds")
+
+        start_pred = time.time()
+        preds = model.predict(X)
+        pred_time = time.time() - start_pred
+        print(f"  Predict time: {pred_time:.3f} seconds")
+
+        corr = np.corrcoef(preds, y)[0, 1]
+        print(f"  Correlation: {corr:.4f}")
+        corrs.append(corr)
+
+    assert (np.array(corrs) > 0.95).all(), f"In-sample correlation too low: {corrs}"
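A note on running the new test (hedged; assumes a CUDA device and the repository root on `sys.path`, which is plausible since `tests/` ships an `__init__.py`):

```python
# Show the timing printout that pytest swallows by default:
#   pytest tests/test_fit_predict_corr.py -s
# or call the plain test function directly:
from tests.test_fit_predict_corr import test_fit_predict_correlation

test_fit_predict_correlation()  # prints fit/predict times and in-sample correlation per kernel
```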
warpgbm-0.1.18/version.txt NEW
@@ -0,0 +1 @@
+0.1.18
{warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/core.py
@@ -12,40 +12,6 @@ histogram_kernels = {
     'hist3': node_kernel.compute_histogram3
 }
 
-@torch.jit.script
-def jit_find_best_split(
-    G: Tensor, H: Tensor,
-    lambda_l2: float,
-    lambda_l1: float,  # unused placeholder for now
-    min_split_gain: float,
-    min_child_weight: float
-) -> Tuple[int, int]:
-    F, B = G.size()
-    Bm1 = B - 1
-    eps = 0
-
-    GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
-    GL, HL_raw = GH[0, :, :-1], GH[1, :, :-1]      # [F, B-1]
-    GP, HP = GH[0, :, -1:], GH[1, :, -1:]          # [F, 1]
-    H_R_raw = HP - HL_raw
-
-    # Validity mask using raw child hessians
-    valid = (HL_raw >= min_child_weight) & (H_R_raw >= min_child_weight)
-
-    # Closed-form gain
-    HL, HP = HL_raw + lambda_l2, HP + lambda_l2
-    num = (HP * GL - HL * GP).pow(2)
-    denom = HP * HL * (HP - HL) + eps
-    gain = torch.where(valid & (num / denom >= min_split_gain), num / denom, torch.full_like(num, -float("inf")))
-
-    gain_flat = gain.view(-1)
-    best_idx = torch.argmax(gain_flat)
-
-    if gain_flat[best_idx].item() == float('-inf'):
-        return -1, -1
-
-    return best_idx // Bm1, best_idx % Bm1
-
 class WarpGBM(BaseEstimator, RegressorMixin):
     def __init__(
         self,
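Editorial note on this removal: the deleted `jit_find_best_split` and the CUDA kernel that replaces it (`best_split_kernel.cu`, added below) rank candidate splits by the same quantity. Writing G_L, H_L for the gradient/hessian sums left of a bin boundary, G_P, H_P for the parent totals, and G_R = G_P - G_L, H_R = H_P - H_L (taking lambda_2 = eps = 0 for clarity):

```latex
\underbrace{\frac{(H_P G_L - H_L G_P)^2}{H_P \, H_L \, (H_P - H_L)}}_{\text{closed form in the removed TorchScript}}
\;=\;
\frac{G_L^2}{H_L} + \frac{G_R^2}{H_R} - \frac{G_P^2}{H_P}
\;=\;
\underbrace{\frac{G_L^2}{H_L} + \frac{G_R^2}{H_R}}_{\text{scanned by the new kernel}} \;-\; \frac{G_P^2}{H_P}
```

Since G_P^2 / H_P is constant within a node, dropping it leaves the argmax over (feature, bin) unchanged; only the absolute scale that `min_split_gain` thresholds against shifts.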
@@ -80,12 +46,8 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.Y_gpu = None
         self.num_features = None
         self.num_samples = None
-        self.out_feature = torch.zeros(1, device=self.device, dtype=torch.int32)
-        self.out_bin = torch.zeros(1, device=self.device, dtype=torch.int32)
         self.min_child_weight = min_child_weight
         self.min_split_gain = min_split_gain
-        self.best_gain = torch.tensor([-float('inf')], dtype=torch.float32, device=self.device)
-        self.best_feature = torch.tensor([-1], dtype=torch.int32, device=self.device)
         self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
         self.compute_histogram = histogram_kernels[histogram_computer]
         self.threads_per_block = threads_per_block
@@ -102,45 +64,49 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.root_node_indices = torch.arange(self.num_samples, device=self.device)
         self.base_prediction = self.Y_gpu.mean().item()
         self.gradients += self.base_prediction
-        self.…
-        self.…
+        self.best_gains = torch.zeros(self.num_features, device=self.device)
+        self.best_bins = torch.zeros(self.num_features, device=self.device, dtype=torch.int32)
+        with torch.no_grad():
+            self.forest = self.grow_forest()
         return self
-
-    def compute_quantile_bins(self, X, num_bins):
-        quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]  # exclude 0% and 100%
-        bin_edges = torch.quantile(X, quantiles, dim=0)       # shape: [B-1, F]
-        return bin_edges.T                                    # shape: [F, B-1]
 
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
-        …
+        with torch.no_grad():
+            self.num_samples, self.num_features = X_np.shape
+            Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
+            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    print("Detected pre-binned integer input — skipping quantile binning.")
+                    bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+
+                    # Store placeholder edges in self.bin_edges to signal that
+                    # binning is skipped at predict time for pre-binned input
+                    bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
+                    bin_edges = bin_edges.to(self.device)
+                    unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+                    return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+                else:
+                    print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
+
+            bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
+            bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
+
+            X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
+
+            for f in range(self.num_features):
+                X_f = X_np[:, f].to('cuda', non_blocking=True)
+                quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
+                bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous()  # shape: [B-1] for 1D input
+                bin_indices_f = bin_indices[:, f].contiguous()  # view into output
+                node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
+                bin_indices[:, f] = bin_indices_f
+                bin_edges[f, :] = bin_edges_f
+
+            unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+            return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
 
     def compute_histograms(self, bin_indices_sub, gradients):
         grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
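The new preprocessing adds a pre-binned fast path: integer inputs whose per-column maximum is below `num_bins` skip quantile binning entirely, both here and in `predict`. A minimal usage sketch (hypothetical shapes and hyperparameters; assumes a CUDA device and that the remaining constructor arguments have defaults):

```python
import numpy as np
from warpgbm import WarpGBM

# Values already lie in {0, ..., 9}, i.e. max < num_bins, so fit() takes the
# "pre-binned integer input" branch and skips the quantile-binning kernel.
X_binned = np.random.randint(0, 10, size=(1_000, 20)).astype(np.int8)
y = np.random.randn(1_000).astype(np.float32)
era = np.zeros(1_000, dtype=np.int32)

model = WarpGBM(num_bins=10, n_estimators=5)
model.fit(X_binned, y, era_id=era)   # prints "Detected pre-binned integer input ..."
preds = model.predict(X_binned)      # predict() accepts the same pre-binned integers
```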
@@ -158,15 +124,24 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         return grad_hist, hess_hist
 
     def find_best_split(self, gradient_histogram, hessian_histogram):
-        …
+        node_kernel.compute_split(
             gradient_histogram,
             hessian_histogram,
-            self.L2_reg,
-            self.L1_reg,
             self.min_split_gain,
             self.min_child_weight,
+            self.L2_reg,
+            self.best_gains,
+            self.best_bins,
+            self.threads_per_block
         )
-        …
+
+        if torch.all(self.best_bins == -1):
+            return -1, -1  # No valid split found
+
+        f = torch.argmax(self.best_gains).item()
+        b = self.best_bins[f].item()
+
+        return f, b
 
     def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
         if depth == self.max_depth:
@@ -226,10 +201,10 @@ class WarpGBM(BaseEstimator, RegressorMixin):
                 depth=0
             )
             forest[i] = tree
-            …
+            # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
+            # self.training_loss.append(loss)
+            # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
+
         print("Finished training forest.")
         return forest
 
@@ -239,103 +214,104 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         We assume `flatten_forest_to_tensors` has produced self.flat_forest with
         "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
         """
-        …
+        with torch.no_grad():
+            # 1) Convert X_np -> bin_indices
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    bin_indices = X_np.astype(np.int8)
+                else:
+                    raise ValueError("Pre-binned integers must be < num_bins")
             else:
-            …
-        return out
+                X_cpu = torch.from_numpy(X_np).type(torch.float32)
+                bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
+                bin_edges_cpu = self.bin_edges.to('cpu')
+                for f in range(self.num_features):
+                    bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
+                bin_indices = bin_indices.numpy()
+
+            # 2) Ensure we have a padded representation
+            self.flat_forest = self.flatten_forest_to_tensors(self.forest)
+
+            features_t = self.flat_forest["features"]      # [n_trees, max_nodes], int16
+            thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
+            values_t = self.flat_forest["leaf_values"]     # [n_trees, max_nodes], float32
+            max_nodes = self.flat_forest["max_nodes"]
+
+            n_trees = features_t.shape[0]
+            N = bin_indices.shape[0]
+            out = np.zeros(N, dtype=np.float32)
+
+            # 3) Process rows in chunks
+            for start in tqdm(range(0, N, chunk_size)):
+                end = min(start + chunk_size, N)
+                chunk_np = bin_indices[start:end]                       # shape [chunk_size, F]
+                chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
+
+                # Accumulate raw (unscaled) leaf sums
+                chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
+
+                # node_idx[i] tracks the current node index in the padded tree for row i
+                node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
+
+                # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
+                active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
+
+                for t in range(n_trees):
+                    # Reset for each tree (each tree is independent)
+                    node_idx.fill_(0)
+                    active.fill_(True)
+
+                    tree_features = features_t[t]  # shape [max_nodes], int16
+                    tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
+                    tree_values = values_t[t]      # shape [max_nodes], float32
+
+                    # Up to self.max_depth+1 layers
+                    for _level in range(self.max_depth + 1):
+                        active_idx = active.nonzero(as_tuple=True)[0]
+                        if active_idx.numel() == 0:
+                            break  # all rows are done in this tree
+
+                        current_node_idx = node_idx[active_idx]
+                        f = tree_features[current_node_idx]   # shape [#active], int16
+                        thr = tree_thresh[current_node_idx]   # shape [#active], int16
+                        vals = tree_values[current_node_idx]  # shape [#active], float32
+
+                        mask_no_node = (f == -2)
+                        mask_leaf = (f == -1)
+
+                        # If leaf, add leaf value and mark inactive.
+                        if mask_leaf.any():
+                            leaf_rows = active_idx[mask_leaf]
+                            chunk_preds[leaf_rows] += vals[mask_leaf]
+                            active[leaf_rows] = False
+
+                        # If no node, mark inactive.
+                        if mask_no_node.any():
+                            no_node_rows = active_idx[mask_no_node]
+                            active[no_node_rows] = False
+
+                        # For internal nodes, perform bin comparison.
+                        mask_internal = (~mask_leaf & ~mask_no_node)
+                        if mask_internal.any():
+                            internal_rows = active_idx[mask_internal]
+                            act_f = f[mask_internal].long()
+                            act_thr = thr[mask_internal]
+                            binvals = chunk_gpu[internal_rows, act_f]
+                            go_left = (binvals <= act_thr)
+                            new_left_idx = current_node_idx[mask_internal] * 2 + 1
+                            new_right_idx = current_node_idx[mask_internal] * 2 + 2
+                            node_idx[internal_rows[go_left]] = new_left_idx[go_left]
+                            node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
+                    # end per-tree layer loop
+                # end for each tree
+
+                out[start:end] = (
+                    self.base_prediction + self.learning_rate * chunk_preds
+                ).cpu().numpy()
+
+        return out
 
     def flatten_forest_to_tensors(self, forest):
         """
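For orientation, the rewritten `predict` walks each tree in a padded array layout: node `i`'s children live at `2*i + 1` and `2*i + 2`, a feature value of `-1` marks a leaf, and `-2` marks a hole in the padding. A small illustrative sketch of that indexing (ours, not library code):

```python
import numpy as np

# One tree of depth 1 => 3 padded nodes: root at 0, children at 1 and 2.
features   = np.array([0, -1, -1], dtype=np.int16)  # root splits on feature 0; nodes 1, 2 are leaves
thresholds = np.array([4, 0, 0], dtype=np.int16)    # root sends bin <= 4 to the left child
leaf_vals  = np.array([0.0, -1.5, 2.5], dtype=np.float32)

def predict_row(binned_row):
    i = 0
    while features[i] >= 0:                          # -1 (leaf) or -2 (no node) stops the descent
        i = 2 * i + 1 if binned_row[features[i]] <= thresholds[i] else 2 * i + 2
    return leaf_vals[i] if features[i] == -1 else 0.0

print(predict_row(np.array([3], dtype=np.int8)))     # bin 3 <= 4 -> left leaf  -> -1.5
print(predict_row(np.array([7], dtype=np.int8)))     # bin 7 >  4 -> right leaf ->  2.5
```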
warpgbm-0.1.18/warpgbm/cuda/best_split_kernel.cu NEW
@@ -0,0 +1,79 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void best_split_kernel_global_only(
+    const float *__restrict__ G, // [F x B]
+    const float *__restrict__ H, // [F x B]
+    int F,
+    int B,
+    float min_split_gain,
+    float min_child_samples,
+    float eps,
+    float *__restrict__ best_gains, // [F]
+    int *__restrict__ best_bins     // [F]
+)
+{
+    int f = blockIdx.x * blockDim.x + threadIdx.x;
+    if (f >= F)
+        return;
+
+    float G_total = 0.0f, H_total = 0.0f;
+    for (int b = 0; b < B; ++b)
+    {
+        G_total += G[f * B + b];
+        H_total += H[f * B + b];
+    }
+
+    float G_L = 0.0f, H_L = 0.0f;
+    float best_gain = min_split_gain;
+    int best_bin = -1;
+
+    for (int b = 0; b < B - 1; ++b)
+    {
+        G_L += G[f * B + b];
+        H_L += H[f * B + b];
+        float G_R = G_total - G_L;
+        float H_R = H_total - H_L;
+
+        if (H_L >= min_child_samples && H_R >= min_child_samples)
+        {
+            float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
+            if (gain > best_gain)
+            {
+                best_gain = gain;
+                best_bin = b;
+            }
+        }
+    }
+
+    best_gains[f] = best_gain;
+    best_bins[f] = best_bin;
+}
+
+void launch_best_split_kernel_cuda(
+    const at::Tensor &G, // [F x B]
+    const at::Tensor &H, // [F x B]
+    float min_split_gain,
+    float min_child_samples,
+    float eps,
+    at::Tensor &best_gains, // [F], float32
+    at::Tensor &best_bins,  // [F], int32
+    int threads)
+{
+    int F = G.size(0);
+    int B = G.size(1);
+
+    int blocks = (F + threads - 1) / threads;
+
+    best_split_kernel_global_only<<<blocks, threads>>>(
+        G.data_ptr<float>(),
+        H.data_ptr<float>(),
+        F,
+        B,
+        min_split_gain,
+        min_child_samples,
+        eps,
+        best_gains.data_ptr<float>(),
+        best_bins.data_ptr<int>());
+}
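The kernel assigns one thread per feature: each thread scans its row of the [F x B] histograms left to right, keeping a running best gain and bin. A hedged pure-PyTorch mirror of the same scan (`reference_best_split` is our name, not part of the package) can serve as a CPU cross-check when unit-testing `compute_split`:

```python
import torch

def reference_best_split(G, H, min_split_gain, min_child_samples, eps):
    """Pure-PyTorch mirror of best_split_kernel_global_only: one row per feature."""
    F, B = G.shape
    best_gains = torch.full((F,), min_split_gain)
    best_bins = torch.full((F,), -1, dtype=torch.int32)
    G_total, H_total = G.sum(dim=1), H.sum(dim=1)
    G_L = torch.zeros(F)
    H_L = torch.zeros(F)
    for b in range(B - 1):  # same left-to-right prefix scan as the kernel
        G_L = G_L + G[:, b]
        H_L = H_L + H[:, b]
        G_R, H_R = G_total - G_L, H_total - H_L
        gain = G_L**2 / (H_L + eps) + G_R**2 / (H_R + eps)
        ok = (H_L >= min_child_samples) & (H_R >= min_child_samples) & (gain > best_gains)
        best_gains = torch.where(ok, gain, best_gains)
        best_bins = torch.where(ok, torch.full_like(best_bins, b), best_bins)
    return best_gains, best_bins
```

Feeding both this and `node_kernel.compute_split` the same random histograms and comparing the per-feature gains and bins is a cheap sanity check.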
warpgbm-0.1.18/warpgbm/cuda/binner.cu NEW
@@ -0,0 +1,52 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void bin_column_kernel(
+    const float *__restrict__ X,         // [N]
+    const float *__restrict__ bin_edges, // [B - 1]
+    int8_t *__restrict__ bin_indices,    // [N]
+    int N,
+    int B_minus1)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
+        return;
+
+    float val = X[idx];
+    int bin = 0;
+
+    // Linear scan over edges: bin_edges is sorted
+    while (bin < B_minus1 && val >= bin_edges[bin])
+    {
+        ++bin;
+    }
+
+    bin_indices[idx] = static_cast<int8_t>(bin);
+}
+
+// C++ launcher for calling from Python
+void launch_bin_column_kernel(
+    at::Tensor X,          // [N]
+    at::Tensor bin_edges,  // [B - 1]
+    at::Tensor bin_indices // [N]
+)
+{
+    const int N = X.size(0);
+    const int B = bin_edges.size(0);
+
+    const int threads = 256;
+    const int blocks = (N + threads - 1) / threads;
+
+    bin_column_kernel<<<blocks, threads>>>(
+        X.data_ptr<float>(),
+        bin_edges.data_ptr<float>(),
+        bin_indices.data_ptr<int8_t>(),
+        N,
+        B);
+
+    // Optional: sync and error check
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("CUDA error: %s\n", cudaGetErrorString(err));
+}
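One subtlety worth noting: the kernel's `val >= bin_edges[bin]` scan counts edges less than or equal to the value, which matches `torch.bucketize(..., right=True)`; the CPU path in `predict` above uses `right=False`, so a value exactly equal to an edge can land one bin apart between the two. A hedged equivalence check (the `warpgbm.cuda.node_kernel` import path is an assumption; `core.py` refers to the compiled module as `node_kernel`):

```python
import torch
from warpgbm.cuda import node_kernel  # assumed import path for the compiled extension

X_f = torch.randn(10_000, device='cuda')
edges = torch.linspace(-2.0, 2.0, steps=9, device='cuda')     # num_bins - 1 = 9 edges
out = torch.empty(10_000, dtype=torch.int8, device='cuda')

node_kernel.custom_cuda_binner(X_f, edges, out)

ref = torch.bucketize(X_f, edges, right=True).to(torch.int8)  # counts edges <= value
assert torch.equal(out, ref)
```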
{warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/node_kernel.cpp
@@ -21,15 +21,14 @@ void launch_histogram_kernel_cuda_2(
     int rows_per_thread = 1);
 
 void launch_best_split_kernel_cuda(
-    const at::Tensor &G,
-    const at::Tensor &H,
-    int F,
-    int B,
+    const at::Tensor &G, // [F x B]
+    const at::Tensor &H, // [F x B]
     float min_split_gain,
     float min_child_samples,
     float eps,
-    at::Tensor &out_feature,
-    at::Tensor &out_bin);
+    at::Tensor &best_gains, // [F], float32
+    at::Tensor &best_bins,
+    int threads);
 
 void launch_histogram_kernel_cuda_configurable(
     const at::Tensor &bin_indices,
@@ -40,6 +39,11 @@ void launch_histogram_kernel_cuda_configurable(
     int threads_per_block = 256,
     int rows_per_thread = 1);
 
+void launch_bin_column_kernel(
+    at::Tensor X,
+    at::Tensor bin_edges,
+    at::Tensor bin_indices);
+
 // Bindings
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -47,4 +51,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
     m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
     m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
+    m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
 }
{warpgbm-0.1.16 → warpgbm-0.1.18/warpgbm.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.16
+Version: 0.1.18
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
                            Version 3, 29 June 2007
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 
 ## Performance Note
 
-In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training** than LightGBM's CPU version and is **2x faster** than LightGBM's GPU version, using default configurations. It also outperforms XGBoost and CatBoost on regression problems, while consuming **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
 
 ---
 
warpgbm-0.1.16/tests/test_fit_predict_corr.py DELETED
@@ -1,66 +0,0 @@
-import numpy as np
-from warpgbm import WarpGBM
-from sklearn.datasets import make_regression
-
-def test_fit_predict_correlation():
-    np.random.seed(42)
-    N = 1_000_000
-    F = 100
-    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
-    era = np.zeros(N, dtype=np.int32)
-    corrs = []
-
-    model = WarpGBM(
-        max_depth = 10,
-        num_bins = 10,
-        n_estimators = 10,
-        learning_rate = 1,
-        verbosity=False,
-        histogram_computer='hist1',
-        threads_per_block=32,
-        rows_per_thread=4
-    )
-
-    model.fit(X, y, era_id=era)
-    preds = model.predict(X)
-
-    # Pearson correlation in-sample
-    corr = np.corrcoef(preds, y)[0, 1]
-    corrs.append(corr)
-
-    model = WarpGBM(
-        max_depth = 10,
-        num_bins = 10,
-        n_estimators = 10,
-        learning_rate = 1,
-        verbosity=False,
-        histogram_computer='hist2',
-        threads_per_block=32,
-        rows_per_thread=4
-    )
-
-    model.fit(X, y, era_id=era)
-    preds = model.predict(X)
-
-    # Pearson correlation in-sample
-    corr = np.corrcoef(preds, y)[0, 1]
-    corrs.append(corr)
-
-    model = WarpGBM(
-        max_depth = 10,
-        num_bins = 10,
-        n_estimators = 10,
-        learning_rate = 1,
-        verbosity=False,
-        histogram_computer='hist3',
-        threads_per_block=32,
-        rows_per_thread=4
-    )
-
-    model.fit(X, y, era_id=era)
-    preds = model.predict(X)
-
-    # Pearson correlation in-sample
-    corr = np.corrcoef(preds, y)[0, 1]
-    corrs.append(corr)
-    assert ( np.array(corrs) > 0.95 ).all(), f"In-sample correlation too low: {corr:.4f}"
warpgbm-0.1.16/version.txt DELETED
@@ -1 +0,0 @@
-0.1.16
warpgbm-0.1.16/warpgbm/cuda/best_split_kernel.cu DELETED
@@ -1,112 +0,0 @@
-#include <torch/extension.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-__global__ void best_split_kernel(
-    const float *__restrict__ G, // [F x B]
-    const float *__restrict__ H, // [F x B]
-    int F,
-    int B,
-    float min_split_gain,
-    float min_child_samples,
-    float eps,
-    int *out_feature,
-    int *out_bin,
-    void *shared_mem)
-{
-    int f = blockIdx.x * blockDim.x + threadIdx.x;
-    if (f >= F)
-        return;
-
-    // Cast shared memory
-    extern __shared__ char smem[];
-    float *gains = reinterpret_cast<float *>(smem);
-    int *features = reinterpret_cast<int *>(&gains[blockDim.x]);
-    int *bins = reinterpret_cast<int *>(&features[blockDim.x]);
-
-    // Calculate total G and H for this feature
-    float G_total = 0.0f, H_total = 0.0f;
-    for (int b = 0; b < B; ++b)
-    {
-        G_total += G[f * B + b];
-        H_total += H[f * B + b];
-    }
-
-    float G_L = 0.0f, H_L = 0.0f;
-    float best_gain = min_split_gain;
-    int best_bin = -1;
-
-    for (int b = 0; b < B - 1; ++b)
-    {
-        G_L += G[f * B + b];
-        H_L += H[f * B + b];
-        float G_R = G_total - G_L;
-        float H_R = H_total - H_L;
-
-        if (H_L > min_child_samples && H_R > min_child_samples)
-        {
-            float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
-            if (gain > best_gain)
-            {
-                best_gain = gain;
-                best_bin = b;
-            }
-        }
-    }
-
-    gains[threadIdx.x] = best_gain;
-    features[threadIdx.x] = f;
-    bins[threadIdx.x] = best_bin;
-    __syncthreads();
-
-    // Thread 0 in each block finds best among its block
-    if (threadIdx.x == 0)
-    {
-        float block_best_gain = min_split_gain;
-        int block_best_feature = -1;
-        int block_best_bin = -1;
-        for (int i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < F; ++i)
-        {
-            if (gains[i] > block_best_gain)
-            {
-                block_best_gain = gains[i];
-                block_best_feature = features[i];
-                block_best_bin = bins[i];
-            }
-        }
-
-        // Write to global outputs
-        *out_feature = block_best_feature;
-        *out_bin = block_best_bin;
-    }
-}
-
-void launch_best_split_kernel_cuda(
-    const at::Tensor &G,
-    const at::Tensor &H,
-    int F,
-    int B,
-    float min_split_gain,
-    float min_child_samples,
-    float eps,
-    at::Tensor &out_feature,
-    at::Tensor &out_bin)
-{
-    int threads = 256;
-    int blocks = (F + threads - 1) / threads;
-
-    size_t shared_mem_bytes = threads * (sizeof(float) + 2 * sizeof(int));
-
-    best_split_kernel<<<blocks, threads, shared_mem_bytes>>>(
-        G.data_ptr<float>(),
-        H.data_ptr<float>(),
-        F,
-        B,
-        min_split_gain,
-        min_child_samples,
-        eps,
-        out_feature.data_ptr<int>(),
-        out_bin.data_ptr<int>(),
-        nullptr // shared memory pointer not needed; just launch size
-    );
-}