warpgbm 0.1.17__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.17/warpgbm.egg-info → warpgbm-0.1.18}/PKG-INFO +2 -2
- {warpgbm-0.1.17 → warpgbm-0.1.18}/README.md +1 -1
- {warpgbm-0.1.17 → warpgbm-0.1.18}/pyproject.toml +1 -1
- warpgbm-0.1.18/tests/test_fit_predict_corr.py +46 -0
- warpgbm-0.1.18/version.txt +1 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/core.py +36 -60
- warpgbm-0.1.18/warpgbm/cuda/best_split_kernel.cu +79 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/node_kernel.cpp +5 -6
- {warpgbm-0.1.17 → warpgbm-0.1.18/warpgbm.egg-info}/PKG-INFO +2 -2
- warpgbm-0.1.17/tests/test_fit_predict_corr.py +0 -66
- warpgbm-0.1.17/version.txt +0 -1
- warpgbm-0.1.17/warpgbm/cuda/best_split_kernel.cu +0 -112
- {warpgbm-0.1.17 → warpgbm-0.1.18}/LICENSE +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/MANIFEST.in +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/setup.cfg +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/setup.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/tests/__init__.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/binner.cu +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/histogram_kernel.cu +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/SOURCES.txt +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: warpgbm
|
3
|
-
Version: 0.1.17
|
3
|
+
Version: 0.1.18
|
4
4
|
Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
|
5
5
|
License: GNU GENERAL PUBLIC LICENSE
|
6
6
|
Version 3, 29 June 2007
|
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
|
|
706
706
|
|
707
707
|
## Performance Note
|
708
708
|
|
709
|
-
In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
|
709
|
+
In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
|
710
710
|
|
711
711
|
---
|
712
712
|
|
@@ -18,7 +18,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
|
|
18
18
|
|
19
19
|
## Performance Note
|
20
20
|
|
21
|
-
In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
|
21
|
+
In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
|
22
22
|
|
23
23
|
---
|
24
24
|
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from warpgbm import WarpGBM
|
3
|
+
from sklearn.datasets import make_regression
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import time
|
7
|
+
from warpgbm import WarpGBM
|
8
|
+
from sklearn.datasets import make_regression
|
9
|
+
|
10
|
+
def test_fit_predictpytee_correlation():
|
11
|
+
np.random.seed(42)
|
12
|
+
N = 100_000
|
13
|
+
F = 1000
|
14
|
+
X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
|
15
|
+
era = np.zeros(N, dtype=np.int32)
|
16
|
+
corrs = []
|
17
|
+
|
18
|
+
for hist_type in ['hist1', 'hist2', 'hist3']:
|
19
|
+
print(f"\nTesting histogram method: {hist_type}")
|
20
|
+
|
21
|
+
model = WarpGBM(
|
22
|
+
max_depth=10,
|
23
|
+
num_bins=10,
|
24
|
+
n_estimators=10,
|
25
|
+
learning_rate=1,
|
26
|
+
verbosity=False,
|
27
|
+
histogram_computer=hist_type,
|
28
|
+
threads_per_block=128,
|
29
|
+
rows_per_thread=4
|
30
|
+
)
|
31
|
+
|
32
|
+
start_fit = time.time()
|
33
|
+
model.fit(X, y, era_id=era)
|
34
|
+
fit_time = time.time() - start_fit
|
35
|
+
print(f" Fit time: {fit_time:.3f} seconds")
|
36
|
+
|
37
|
+
start_pred = time.time()
|
38
|
+
preds = model.predict(X)
|
39
|
+
pred_time = time.time() - start_pred
|
40
|
+
print(f" Predict time: {pred_time:.3f} seconds")
|
41
|
+
|
42
|
+
corr = np.corrcoef(preds, y)[0, 1]
|
43
|
+
print(f" Correlation: {corr:.4f}")
|
44
|
+
corrs.append(corr)
|
45
|
+
|
46
|
+
assert (np.array(corrs) > 0.95).all(), f"In-sample correlation too low: {corrs}"
|
@@ -0,0 +1 @@
|
|
1
|
+
0.1.18
|
@@ -12,36 +12,6 @@ histogram_kernels = {
|
|
12
12
|
'hist3': node_kernel.compute_histogram3
|
13
13
|
}
|
14
14
|
|
15
|
-
@torch.jit.script
|
16
|
-
def jit_find_best_split(
|
17
|
-
G: Tensor, H: Tensor,
|
18
|
-
lambda_l2: float,
|
19
|
-
lambda_l1: float, # unused placeholder for now
|
20
|
-
min_split_gain: float,
|
21
|
-
min_child_weight: float
|
22
|
-
) -> Tuple[int, int]:
|
23
|
-
F, B = G.size()
|
24
|
-
Bm1 = B - 1
|
25
|
-
|
26
|
-
GH = torch.stack([G, H], dim=0).cumsum(dim=2) # [2, F, B]
|
27
|
-
GL, HL = GH[0, :, :-1], GH[1, :, :-1] # [F, B-1]
|
28
|
-
GP, HP = GH[0, :, -1:], GH[1, :, -1:] # [F, 1]
|
29
|
-
GR = GP - GL
|
30
|
-
HR = HP - HL
|
31
|
-
|
32
|
-
# Validity mask using raw child hessians
|
33
|
-
valid = (HL >= min_child_weight) & (HR >= min_child_weight)
|
34
|
-
g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
|
35
|
-
gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
|
36
|
-
|
37
|
-
gain_flat = gain.view(-1)
|
38
|
-
best_idx = torch.argmax(gain_flat)
|
39
|
-
|
40
|
-
if gain_flat[best_idx].item() == float('-inf'):
|
41
|
-
return -1, -1
|
42
|
-
|
43
|
-
return best_idx // Bm1, best_idx % Bm1
|
44
|
-
|
45
15
|
class WarpGBM(BaseEstimator, RegressorMixin):
|
46
16
|
def __init__(
|
47
17
|
self,
|
@@ -76,12 +46,8 @@ class WarpGBM(BaseEstimator, RegressorMixin):
|
|
76
46
|
self.Y_gpu = None
|
77
47
|
self.num_features = None
|
78
48
|
self.num_samples = None
|
79
|
-
self.out_feature = torch.zeros(1, device=self.device, dtype=torch.int32)
|
80
|
-
self.out_bin = torch.zeros(1, device=self.device, dtype=torch.int32)
|
81
49
|
self.min_child_weight = min_child_weight
|
82
50
|
self.min_split_gain = min_split_gain
|
83
|
-
self.best_gain = torch.tensor([-float('inf')], dtype=torch.float32, device=self.device)
|
84
|
-
self.best_feature = torch.tensor([-1], dtype=torch.int32, device=self.device)
|
85
51
|
self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
|
86
52
|
self.compute_histogram = histogram_kernels[histogram_computer]
|
87
53
|
self.threads_per_block = threads_per_block
|
@@ -98,8 +64,10 @@ class WarpGBM(BaseEstimator, RegressorMixin):
|
|
98
64
|
self.root_node_indices = torch.arange(self.num_samples, device=self.device)
|
99
65
|
self.base_prediction = self.Y_gpu.mean().item()
|
100
66
|
self.gradients += self.base_prediction
|
101
|
-
self.
|
102
|
-
self.
|
67
|
+
self.best_gains = torch.zeros(self.num_features, device=self.device)
|
68
|
+
self.best_bins = torch.zeros(self.num_features, device=self.device, dtype=torch.int32)
|
69
|
+
with torch.no_grad():
|
70
|
+
self.forest = self.grow_forest()
|
103
71
|
return self
|
104
72
|
|
105
73
|
def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
|
@@ -156,15 +124,24 @@ class WarpGBM(BaseEstimator, RegressorMixin):
|
|
156
124
|
return grad_hist, hess_hist
|
157
125
|
|
158
126
|
def find_best_split(self, gradient_histogram, hessian_histogram):
|
159
|
-
|
127
|
+
node_kernel.compute_split(
|
160
128
|
gradient_histogram,
|
161
129
|
hessian_histogram,
|
162
|
-
self.L2_reg,
|
163
|
-
self.L1_reg,
|
164
130
|
self.min_split_gain,
|
165
131
|
self.min_child_weight,
|
132
|
+
self.L2_reg,
|
133
|
+
self.best_gains,
|
134
|
+
self.best_bins,
|
135
|
+
self.threads_per_block
|
166
136
|
)
|
167
|
-
|
137
|
+
|
138
|
+
if torch.all(self.best_bins == -1):
|
139
|
+
return -1, -1 # No valid split found
|
140
|
+
|
141
|
+
f = torch.argmax(self.best_gains).item()
|
142
|
+
b = self.best_bins[f].item()
|
143
|
+
|
144
|
+
return f, b
|
168
145
|
|
169
146
|
def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
|
170
147
|
if depth == self.max_depth:
|
@@ -208,27 +185,26 @@ class WarpGBM(BaseEstimator, RegressorMixin):
|
|
208
185
|
return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
|
209
186
|
|
210
187
|
def grow_forest(self):
|
211
|
-
|
212
|
-
|
213
|
-
self.training_loss = []
|
214
|
-
|
215
|
-
for i in tqdm( range(self.n_estimators) ):
|
216
|
-
self.residual = self.Y_gpu - self.gradients
|
217
|
-
|
218
|
-
self.root_gradient_histogram, self.root_hessian_histogram = \
|
219
|
-
self.compute_histograms(self.bin_indices, self.residual)
|
220
|
-
|
221
|
-
tree = self.grow_tree(
|
222
|
-
self.root_gradient_histogram,
|
223
|
-
self.root_hessian_histogram,
|
224
|
-
self.root_node_indices,
|
225
|
-
depth=0
|
226
|
-
)
|
227
|
-
forest[i] = tree
|
228
|
-
# loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
|
229
|
-
# self.training_loss.append(loss)
|
230
|
-
# print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
|
188
|
+
forest = [{} for _ in range(self.n_estimators)]
|
189
|
+
self.training_loss = []
|
231
190
|
|
191
|
+
for i in tqdm( range(self.n_estimators) ):
|
192
|
+
self.residual = self.Y_gpu - self.gradients
|
193
|
+
|
194
|
+
self.root_gradient_histogram, self.root_hessian_histogram = \
|
195
|
+
self.compute_histograms(self.bin_indices, self.residual)
|
196
|
+
|
197
|
+
tree = self.grow_tree(
|
198
|
+
self.root_gradient_histogram,
|
199
|
+
self.root_hessian_histogram,
|
200
|
+
self.root_node_indices,
|
201
|
+
depth=0
|
202
|
+
)
|
203
|
+
forest[i] = tree
|
204
|
+
# loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
|
205
|
+
# self.training_loss.append(loss)
|
206
|
+
# print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
|
207
|
+
|
232
208
|
print("Finished training forest.")
|
233
209
|
return forest
|
234
210
|
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#include <torch/extension.h>
|
2
|
+
#include <cuda.h>
|
3
|
+
#include <cuda_runtime.h>
|
4
|
+
|
5
|
+
__global__ void best_split_kernel_global_only(
|
6
|
+
const float *__restrict__ G, // [F x B]
|
7
|
+
const float *__restrict__ H, // [F x B]
|
8
|
+
int F,
|
9
|
+
int B,
|
10
|
+
float min_split_gain,
|
11
|
+
float min_child_samples,
|
12
|
+
float eps,
|
13
|
+
float *__restrict__ best_gains, // [F]
|
14
|
+
int *__restrict__ best_bins // [F]
|
15
|
+
)
|
16
|
+
{
|
17
|
+
int f = blockIdx.x * blockDim.x + threadIdx.x;
|
18
|
+
if (f >= F)
|
19
|
+
return;
|
20
|
+
|
21
|
+
float G_total = 0.0f, H_total = 0.0f;
|
22
|
+
for (int b = 0; b < B; ++b)
|
23
|
+
{
|
24
|
+
G_total += G[f * B + b];
|
25
|
+
H_total += H[f * B + b];
|
26
|
+
}
|
27
|
+
|
28
|
+
float G_L = 0.0f, H_L = 0.0f;
|
29
|
+
float best_gain = min_split_gain;
|
30
|
+
int best_bin = -1;
|
31
|
+
|
32
|
+
for (int b = 0; b < B - 1; ++b)
|
33
|
+
{
|
34
|
+
G_L += G[f * B + b];
|
35
|
+
H_L += H[f * B + b];
|
36
|
+
float G_R = G_total - G_L;
|
37
|
+
float H_R = H_total - H_L;
|
38
|
+
|
39
|
+
if (H_L >= min_child_samples && H_R >= min_child_samples)
|
40
|
+
{
|
41
|
+
float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
|
42
|
+
if (gain > best_gain)
|
43
|
+
{
|
44
|
+
best_gain = gain;
|
45
|
+
best_bin = b;
|
46
|
+
}
|
47
|
+
}
|
48
|
+
}
|
49
|
+
|
50
|
+
best_gains[f] = best_gain;
|
51
|
+
best_bins[f] = best_bin;
|
52
|
+
}
|
53
|
+
|
54
|
+
void launch_best_split_kernel_cuda(
|
55
|
+
const at::Tensor &G, // [F x B]
|
56
|
+
const at::Tensor &H, // [F x B]
|
57
|
+
float min_split_gain,
|
58
|
+
float min_child_samples,
|
59
|
+
float eps,
|
60
|
+
at::Tensor &best_gains, // [F], float32
|
61
|
+
at::Tensor &best_bins, // [F], int32
|
62
|
+
int threads)
|
63
|
+
{
|
64
|
+
int F = G.size(0);
|
65
|
+
int B = G.size(1);
|
66
|
+
|
67
|
+
int blocks = (F + threads - 1) / threads;
|
68
|
+
|
69
|
+
best_split_kernel_global_only<<<blocks, threads>>>(
|
70
|
+
G.data_ptr<float>(),
|
71
|
+
H.data_ptr<float>(),
|
72
|
+
F,
|
73
|
+
B,
|
74
|
+
min_split_gain,
|
75
|
+
min_child_samples,
|
76
|
+
eps,
|
77
|
+
best_gains.data_ptr<float>(),
|
78
|
+
best_bins.data_ptr<int>());
|
79
|
+
}
|
@@ -21,15 +21,14 @@ void launch_histogram_kernel_cuda_2(
|
|
21
21
|
int rows_per_thread = 1);
|
22
22
|
|
23
23
|
void launch_best_split_kernel_cuda(
|
24
|
-
const at::Tensor &G,
|
25
|
-
const at::Tensor &H,
|
26
|
-
int F,
|
27
|
-
int B,
|
24
|
+
const at::Tensor &G, // [F x B]
|
25
|
+
const at::Tensor &H, // [F x B]
|
28
26
|
float min_split_gain,
|
29
27
|
float min_child_samples,
|
30
28
|
float eps,
|
31
|
-
at::Tensor &out_feature,
|
32
|
-
at::Tensor &out_bin);
|
29
|
+
at::Tensor &best_gains, // [F], float32
|
30
|
+
at::Tensor &best_bins,
|
31
|
+
int threads);
|
33
32
|
|
34
33
|
void launch_histogram_kernel_cuda_configurable(
|
35
34
|
const at::Tensor &bin_indices,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: warpgbm
|
3
|
-
Version: 0.1.17
|
3
|
+
Version: 0.1.18
|
4
4
|
Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
|
5
5
|
License: GNU GENERAL PUBLIC LICENSE
|
6
6
|
Version 3, 29 June 2007
|
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
|
|
706
706
|
|
707
707
|
## Performance Note
|
708
708
|
|
709
|
-
In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
|
709
|
+
In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
|
710
710
|
|
711
711
|
---
|
712
712
|
|
@@ -1,66 +0,0 @@
|
|
1
|
-
import numpy as np
|
2
|
-
from warpgbm import WarpGBM
|
3
|
-
from sklearn.datasets import make_regression
|
4
|
-
|
5
|
-
def test_fit_predict_correlation():
|
6
|
-
np.random.seed(42)
|
7
|
-
N = 1_000_000
|
8
|
-
F = 100
|
9
|
-
X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
|
10
|
-
era = np.zeros(N, dtype=np.int32)
|
11
|
-
corrs = []
|
12
|
-
|
13
|
-
model = WarpGBM(
|
14
|
-
max_depth = 10,
|
15
|
-
num_bins = 10,
|
16
|
-
n_estimators = 10,
|
17
|
-
learning_rate = 1,
|
18
|
-
verbosity=False,
|
19
|
-
histogram_computer='hist1',
|
20
|
-
threads_per_block=32,
|
21
|
-
rows_per_thread=4
|
22
|
-
)
|
23
|
-
|
24
|
-
model.fit(X, y, era_id=era)
|
25
|
-
preds = model.predict(X)
|
26
|
-
|
27
|
-
# Pearson correlation in-sample
|
28
|
-
corr = np.corrcoef(preds, y)[0, 1]
|
29
|
-
corrs.append(corr)
|
30
|
-
|
31
|
-
model = WarpGBM(
|
32
|
-
max_depth = 10,
|
33
|
-
num_bins = 10,
|
34
|
-
n_estimators = 10,
|
35
|
-
learning_rate = 1,
|
36
|
-
verbosity=False,
|
37
|
-
histogram_computer='hist2',
|
38
|
-
threads_per_block=32,
|
39
|
-
rows_per_thread=4
|
40
|
-
)
|
41
|
-
|
42
|
-
model.fit(X, y, era_id=era)
|
43
|
-
preds = model.predict(X)
|
44
|
-
|
45
|
-
# Pearson correlation in-sample
|
46
|
-
corr = np.corrcoef(preds, y)[0, 1]
|
47
|
-
corrs.append(corr)
|
48
|
-
|
49
|
-
model = WarpGBM(
|
50
|
-
max_depth = 10,
|
51
|
-
num_bins = 10,
|
52
|
-
n_estimators = 10,
|
53
|
-
learning_rate = 1,
|
54
|
-
verbosity=False,
|
55
|
-
histogram_computer='hist3',
|
56
|
-
threads_per_block=32,
|
57
|
-
rows_per_thread=4
|
58
|
-
)
|
59
|
-
|
60
|
-
model.fit(X, y, era_id=era)
|
61
|
-
preds = model.predict(X)
|
62
|
-
|
63
|
-
# Pearson correlation in-sample
|
64
|
-
corr = np.corrcoef(preds, y)[0, 1]
|
65
|
-
corrs.append(corr)
|
66
|
-
assert ( np.array(corrs) > 0.95 ).all(), f"In-sample correlation too low: {corr:.4f}"
|
warpgbm-0.1.17/version.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.1.17
|
@@ -1,112 +0,0 @@
|
|
1
|
-
#include <torch/extension.h>
|
2
|
-
#include <cuda.h>
|
3
|
-
#include <cuda_runtime.h>
|
4
|
-
|
5
|
-
__global__ void best_split_kernel(
|
6
|
-
const float *__restrict__ G, // [F x B]
|
7
|
-
const float *__restrict__ H, // [F x B]
|
8
|
-
int F,
|
9
|
-
int B,
|
10
|
-
float min_split_gain,
|
11
|
-
float min_child_samples,
|
12
|
-
float eps,
|
13
|
-
int *out_feature,
|
14
|
-
int *out_bin,
|
15
|
-
void *shared_mem)
|
16
|
-
{
|
17
|
-
int f = blockIdx.x * blockDim.x + threadIdx.x;
|
18
|
-
if (f >= F)
|
19
|
-
return;
|
20
|
-
|
21
|
-
// Cast shared memory
|
22
|
-
extern __shared__ char smem[];
|
23
|
-
float *gains = reinterpret_cast<float *>(smem);
|
24
|
-
int *features = reinterpret_cast<int *>(&gains[blockDim.x]);
|
25
|
-
int *bins = reinterpret_cast<int *>(&features[blockDim.x]);
|
26
|
-
|
27
|
-
// Calculate total G and H for this feature
|
28
|
-
float G_total = 0.0f, H_total = 0.0f;
|
29
|
-
for (int b = 0; b < B; ++b)
|
30
|
-
{
|
31
|
-
G_total += G[f * B + b];
|
32
|
-
H_total += H[f * B + b];
|
33
|
-
}
|
34
|
-
|
35
|
-
float G_L = 0.0f, H_L = 0.0f;
|
36
|
-
float best_gain = min_split_gain;
|
37
|
-
int best_bin = -1;
|
38
|
-
|
39
|
-
for (int b = 0; b < B - 1; ++b)
|
40
|
-
{
|
41
|
-
G_L += G[f * B + b];
|
42
|
-
H_L += H[f * B + b];
|
43
|
-
float G_R = G_total - G_L;
|
44
|
-
float H_R = H_total - H_L;
|
45
|
-
|
46
|
-
if (H_L > min_child_samples && H_R > min_child_samples)
|
47
|
-
{
|
48
|
-
float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
|
49
|
-
if (gain > best_gain)
|
50
|
-
{
|
51
|
-
best_gain = gain;
|
52
|
-
best_bin = b;
|
53
|
-
}
|
54
|
-
}
|
55
|
-
}
|
56
|
-
|
57
|
-
gains[threadIdx.x] = best_gain;
|
58
|
-
features[threadIdx.x] = f;
|
59
|
-
bins[threadIdx.x] = best_bin;
|
60
|
-
__syncthreads();
|
61
|
-
|
62
|
-
// Thread 0 in each block finds best among its block
|
63
|
-
if (threadIdx.x == 0)
|
64
|
-
{
|
65
|
-
float block_best_gain = min_split_gain;
|
66
|
-
int block_best_feature = -1;
|
67
|
-
int block_best_bin = -1;
|
68
|
-
for (int i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < F; ++i)
|
69
|
-
{
|
70
|
-
if (gains[i] > block_best_gain)
|
71
|
-
{
|
72
|
-
block_best_gain = gains[i];
|
73
|
-
block_best_feature = features[i];
|
74
|
-
block_best_bin = bins[i];
|
75
|
-
}
|
76
|
-
}
|
77
|
-
|
78
|
-
// Write to global outputs
|
79
|
-
*out_feature = block_best_feature;
|
80
|
-
*out_bin = block_best_bin;
|
81
|
-
}
|
82
|
-
}
|
83
|
-
|
84
|
-
void launch_best_split_kernel_cuda(
|
85
|
-
const at::Tensor &G,
|
86
|
-
const at::Tensor &H,
|
87
|
-
int F,
|
88
|
-
int B,
|
89
|
-
float min_split_gain,
|
90
|
-
float min_child_samples,
|
91
|
-
float eps,
|
92
|
-
at::Tensor &out_feature,
|
93
|
-
at::Tensor &out_bin)
|
94
|
-
{
|
95
|
-
int threads = 256;
|
96
|
-
int blocks = (F + threads - 1) / threads;
|
97
|
-
|
98
|
-
size_t shared_mem_bytes = threads * (sizeof(float) + 2 * sizeof(int));
|
99
|
-
|
100
|
-
best_split_kernel<<<blocks, threads, shared_mem_bytes>>>(
|
101
|
-
G.data_ptr<float>(),
|
102
|
-
H.data_ptr<float>(),
|
103
|
-
F,
|
104
|
-
B,
|
105
|
-
min_split_gain,
|
106
|
-
min_child_samples,
|
107
|
-
eps,
|
108
|
-
out_feature.data_ptr<int>(),
|
109
|
-
out_bin.data_ptr<int>(),
|
110
|
-
nullptr // shared memory pointer not needed; just launch size
|
111
|
-
);
|
112
|
-
}
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|