warpgbm 0.1.26__tar.gz → 0.1.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.26/warpgbm.egg-info → warpgbm-0.1.27}/PKG-INFO +1 -1
- {warpgbm-0.1.26 → warpgbm-0.1.27}/pyproject.toml +2 -2
- warpgbm-0.1.27/tests/full_numerai_test.py +67 -0
- warpgbm-0.1.27/tests/test_fit_predict_corr.py +52 -0
- warpgbm-0.1.27/version.txt +1 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/core.py +78 -80
- warpgbm-0.1.27/warpgbm/cuda/histogram_kernel.cu +95 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/node_kernel.cpp +3 -20
- warpgbm-0.1.27/warpgbm/metrics.py +10 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27/warpgbm.egg-info}/PKG-INFO +1 -1
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/SOURCES.txt +2 -0
- warpgbm-0.1.26/tests/test_fit_predict_corr.py +0 -58
- warpgbm-0.1.26/version.txt +0 -1
- warpgbm-0.1.26/warpgbm/cuda/histogram_kernel.cu +0 -250
- {warpgbm-0.1.26 → warpgbm-0.1.27}/LICENSE +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/MANIFEST.in +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/README.md +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/setup.cfg +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/setup.py +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/tests/__init__.py +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/tests/numerai_test.py +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/best_split_kernel.cu +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/binner.cu +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/predict.cu +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/top_level.txt +0 -0
{warpgbm-0.1.26 → warpgbm-0.1.27}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "warpgbm"
-version = "0.1.26"
+version = "0.1.27"
 description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -13,5 +13,5 @@ dependencies = [
   "torch",
   "numpy",
   "tqdm",
-  "scikit-learn"
+  "scikit-learn"
 ]
warpgbm-0.1.27/tests/full_numerai_test.py

@@ -0,0 +1,67 @@
+from numerapi import NumerAPI
+import pandas as pd
+import numpy as np
+from warpgbm import WarpGBM
+import time
+from sklearn.metrics import mean_squared_error
+
+
+def predict_in_chunks(model, X, chunk_size=100_000):
+    preds = []
+    for i in range(0, X.shape[0], chunk_size):
+        X_chunk = X[i : i + chunk_size]
+        preds.append(model.predict(X_chunk))
+    return np.concatenate(preds)
+
+
+def test_numerai_data():
+    napi = NumerAPI()
+    napi.download_dataset("v5.0/train.parquet", "numerai_train.parquet")
+    napi.download_dataset("v5.0/validation.parquet", "numerai_validation.parquet")
+
+    data = pd.concat([
+        pd.read_parquet("numerai_train.parquet"),
+        pd.read_parquet("numerai_validation.parquet")
+    ])
+    features = [f for f in list(data) if "feature" in f]
+    target = "target"
+    data = data.loc[data[target].isna() == False]
+
+    X = data[features].astype("int8").values[:]
+    y = data[target].values
+
+    model = WarpGBM(
+        max_depth=3,
+        num_bins=5,
+        n_estimators=10,
+        learning_rate=1,
+        threads_per_block=64,
+        rows_per_thread=4,
+        colsample_bytree=0.8,
+    )
+
+    start_fit = time.time()
+    model.fit(
+        X,
+        y,
+        # era_id=era,
+        # X_eval=X,
+        # y_eval=y,
+        # eval_every_n_trees=10,
+        # early_stopping_rounds=1,
+    )
+    fit_time = time.time() - start_fit
+    print(f" Fit time: {fit_time:.3f} seconds")
+
+    start_pred = time.time()
+    preds = predict_in_chunks(model, X, chunk_size=500_000)
+    pred_time = time.time() - start_pred
+    print(f" Predict time: {pred_time:.3f} seconds")
+
+    corr = np.corrcoef(preds, y)[0, 1]
+    mse = mean_squared_error(preds, y)
+    print(f" Correlation: {corr:.4f}")
+    print(f" MSE: {mse:.4f}")
+
+    # assert corr > 0.68, f"In-sample correlation too low: {corr}"
+    # assert mse < 0.03, f"In-sample mse too high: {mse}"
warpgbm-0.1.27/tests/test_fit_predict_corr.py

@@ -0,0 +1,52 @@
+import numpy as np
+from warpgbm import WarpGBM
+from sklearn.datasets import make_regression
+import time
+from sklearn.metrics import mean_squared_error
+
+
+def test_fit_predictpytee_correlation():
+    np.random.seed(42)
+    N = 100_000
+    F = 1000
+    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
+    era = np.zeros(N, dtype=np.int32)
+    corrs = []
+    mses = []
+
+    model = WarpGBM(
+        max_depth=10,
+        num_bins=10,
+        n_estimators=100,
+        learning_rate=1,
+        threads_per_block=64,
+        rows_per_thread=4,
+        colsample_bytree=1.0,
+    )
+
+    start_fit = time.time()
+    model.fit(
+        X,
+        y,
+        era_id=era,
+        X_eval=X,
+        y_eval=y,
+        eval_every_n_trees=10,
+        early_stopping_rounds=1,
+        eval_metric="corr",
+    )
+    fit_time = time.time() - start_fit
+    print(f" Fit time: {fit_time:.3f} seconds")
+
+    start_pred = time.time()
+    preds = model.predict(X)
+    pred_time = time.time() - start_pred
+    print(f" Predict time: {pred_time:.3f} seconds")
+
+    corr = np.corrcoef(preds, y)[0, 1]
+    mse = mean_squared_error(preds, y)
+    print(f" Correlation: {corr:.4f}")
+    print(f" MSE: {mse:.4f}")
+
+    assert (corr > 0.9), f"In-sample correlation too low: {corrs}"
+    assert (mse < 2), f"In-sample mse too high: {mses}"
warpgbm-0.1.27/version.txt

@@ -0,0 +1 @@
+0.1.27
{warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/core.py

@@ -1,19 +1,14 @@
 import torch
 import numpy as np
 from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.metrics import mean_squared_log_error
 from warpgbm.cuda import node_kernel
+from warpgbm.metrics import rmsle_torch
 from tqdm import tqdm
 from typing import Tuple
 from torch import Tensor
 import gc
 
-histogram_kernels = {
-    "hist1": node_kernel.compute_histogram,
-    "hist2": node_kernel.compute_histogram2,
-    "hist3": node_kernel.compute_histogram3,
-}
-
-
 class WarpGBM(BaseEstimator, RegressorMixin):
     def __init__(
         self,
@@ -23,8 +18,6 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         n_estimators=100,
         min_child_weight=20,
         min_split_gain=0.0,
-        verbosity=True,
-        histogram_computer="hist3",
         threads_per_block=64,
         rows_per_thread=4,
         L2_reg=1e-6,
@@ -40,7 +33,6 @@ class WarpGBM(BaseEstimator, RegressorMixin):
             n_estimators=n_estimators,
             min_child_weight=min_child_weight,
             min_split_gain=min_split_gain,
-            histogram_computer=histogram_computer,
             threads_per_block=threads_per_block,
             rows_per_thread=rows_per_thread,
             L2_reg=L2_reg,
@@ -68,7 +60,6 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.min_child_weight = min_child_weight
         self.min_split_gain = min_split_gain
         self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
-        self.compute_histogram = histogram_kernels[histogram_computer]
         self.threads_per_block = threads_per_block
         self.rows_per_thread = rows_per_thread
         self.L2_reg = L2_reg
@@ -128,10 +119,6 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         )
         if kwargs["L2_reg"] < 0 or kwargs["L1_reg"] < 0:
             raise ValueError("L2_reg and L1_reg must be non-negative.")
-        if kwargs["histogram_computer"] not in histogram_kernels:
-            raise ValueError(
-                f"Invalid histogram_computer: {kwargs['histogram_computer']}. Choose from {list(histogram_kernels.keys())}."
-            )
         if kwargs["colsample_bytree"] <= 0 or kwargs["colsample_bytree"] > 1:
             raise ValueError(
                 f"Invalid colsample_bytree: {kwargs['colsample_bytree']}. Must be a float value > 0 and <= 1."
@@ -206,9 +193,9 @@ class WarpGBM(BaseEstimator, RegressorMixin):
             # No early stopping = set to "never trigger"
             early_stopping_rounds = self.n_estimators + 1
 
-        if eval_metric not in ["mse", "corr"]:
+        if eval_metric not in ["mse", "corr", "rmsle"]:
             raise ValueError(
-                f"Invalid eval_metric: {eval_metric}. Choose 'mse' or 'corr'."
+                f"Invalid eval_metric: {eval_metric}. Choose 'mse' or 'corr', 'rmsle'."
             )
 
         return early_stopping_rounds  # May have been defaulted here
@@ -237,14 +224,16 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         )
         self.num_samples, self.num_features = X.shape
         self.gradients = torch.zeros_like(self.Y_gpu)
-        self.root_node_indices = torch.arange(self.num_samples, device=self.device)
+        self.root_node_indices = torch.arange(self.num_samples, device=self.device, dtype=torch.int32)
         self.base_prediction = self.Y_gpu.mean().item()
         self.gradients += self.base_prediction
-        self.
-
-
-
-        self.
+        if self.colsample_bytree < 1.0:
+            k = max(1, int(self.colsample_bytree * self.num_features))
+        else:
+            k = self.num_features
+        self.best_gains = torch.zeros(k, device=self.device)
+        self.best_bins = torch.zeros(k, device=self.device, dtype=torch.int32)
+        self.feature_indices = torch.arange(self.num_features, device=self.device, dtype=torch.int32)
 
         # ─── Optional Eval Set ───
         if X_eval is not None and y_eval is not None:
@@ -273,50 +262,47 @@ class WarpGBM(BaseEstimator, RegressorMixin):
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
         with torch.no_grad():
            self.num_samples, self.num_features = X_np.shape
+
            Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
-            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
-            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-            if is_integer_type:
-                max_vals = X_np.max(axis=0)
-                if np.all(max_vals < self.num_bins):
-                    print(
-                        "Detected pre-binned integer input — skipping quantile binning."
-                    )
-                    bin_indices = (
-                        torch.from_numpy(X_np)
-                        .to(self.device)
-                        .contiguous()
-                        .to(torch.int8)
-                    )
-
-                    # to indicate that we skip binning at predict-time
-                    bin_edges = torch.arange(
-                        1, self.num_bins, dtype=torch.float32
-                    ).repeat(self.num_features, 1)
-                    bin_edges = bin_edges.to(self.device)
-                    unique_eras, era_indices = torch.unique(
-                        era_id_gpu, return_inverse=True
-                    )
-                    return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
-                else:
-                    print(
-                        "Integer input detected, but values exceed num_bins — falling back to quantile binning."
-                    )
+            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
 
            bin_indices = torch.empty(
                (self.num_samples, self.num_features), dtype=torch.int8, device="cuda"
            )
+
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            max_vals = X_np.max(axis=0)
+
+            if is_integer_type and np.all(max_vals < self.num_bins):
+                print(
+                    "Detected pre-binned integer input — skipping quantile binning."
+                )
+                for f in range(self.num_features):
+                    bin_indices[:, f] = torch.as_tensor(X_np[:, f], device=self.device).contiguous()
+                # bin_indices = X_np.to("cuda", non_blocking=True).contiguous()
+
+                # We'll store None or an empty tensor in self.bin_edges
+                # to indicate that we skip binning at predict-time
+                bin_edges = torch.arange(
+                    1, self.num_bins, dtype=torch.float32
+                ).repeat(self.num_features, 1)
+                bin_edges = bin_edges.to(self.device)
+                unique_eras, era_indices = torch.unique(
+                    era_id_gpu, return_inverse=True
+                )
+                return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+
+            print("quantile binning.")
+
            bin_edges = torch.empty(
                (self.num_features, self.num_bins - 1),
                dtype=torch.float32,
                device="cuda",
            )
 
-            X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
-
            for f in range(self.num_features):
-                X_f = X_np[:, f].
+                X_f = torch.as_tensor(X_np[:, f], device=self.device, dtype=torch.float32).contiguous()
                quantiles = torch.linspace(
                    0, 1, self.num_bins + 1, device="cuda", dtype=X_f.dtype
                )[1:-1]
@@ -331,17 +317,19 @@ class WarpGBM(BaseEstimator, RegressorMixin):
            unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
            return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
 
-    def compute_histograms(self,
+    def compute_histograms(self, sample_indices, feature_indices):
        grad_hist = torch.zeros(
-            (
+            (len(feature_indices), self.num_bins), device=self.device, dtype=torch.float32
        )
        hess_hist = torch.zeros(
-            (
+            (len(feature_indices), self.num_bins), device=self.device, dtype=torch.float32
        )
 
-
-
-
+        node_kernel.compute_histogram3(
+            self.bin_indices,
+            self.residual,
+            sample_indices,
+            feature_indices,
            grad_hist,
            hess_hist,
            self.num_bins,
@@ -364,6 +352,9 @@ class WarpGBM(BaseEstimator, RegressorMixin):
 
        if torch.all(self.best_bins == -1):
            return -1, -1  # No valid split found
+
+        # print(self.best_bins)
+        # print(self.best_gains)
 
        f = torch.argmax(self.best_gains).item()
        b = self.best_bins[f].item()
@@ -381,30 +372,38 @@ class WarpGBM(BaseEstimator, RegressorMixin):
            gradient_histogram, hessian_histogram
        )
 
+        # print(local_feature, best_bin)
+
        if local_feature == -1:
            leaf_value = self.residual[node_indices].mean()
            self.gradients[node_indices] += self.learning_rate * leaf_value
            return {"leaf_value": leaf_value.item(), "samples": parent_size}
-
+
+        # print("DEBUG SHAPES -> bin_indices:", self.bin_indices.shape,
+        #     "| node_indices max:", node_indices.max().item(),
+        #     "| local_feature:", local_feature,
+        #     "| feat_indices_tree len:", len(self.feat_indices_tree),
+        #     "| feat index:", self.feat_indices_tree[local_feature])
+
        split_mask = self.bin_indices[node_indices, self.feat_indices_tree[local_feature]] <= best_bin
        left_indices = node_indices[split_mask]
        right_indices = node_indices[~split_mask]
 
+        # print("DEBUG SHAPES -> left_indices:", left_indices.shape,
+        #     "| right_indices:", right_indices.shape,
+        #     "| parent_size:", parent_size,
+        #     "| local_feature:", local_feature,
+        #     "| best_bin:", best_bin)
+
        left_size = left_indices.numel()
        right_size = right_indices.numel()
 
        if left_size <= right_size:
-            grad_hist_left, hess_hist_left = self.compute_histograms(
-                self.bin_indices.index_select(0, left_indices).index_select(1, self.feat_indices_tree)
-                , self.residual[left_indices]
-            )
+            grad_hist_left, hess_hist_left = self.compute_histograms(left_indices, self.feat_indices_tree)
            grad_hist_right = gradient_histogram - grad_hist_left
            hess_hist_right = hessian_histogram - hess_hist_left
        else:
-            grad_hist_right, hess_hist_right = self.compute_histograms(
-                self.bin_indices.index_select(0, right_indices).index_select(1, self.feat_indices_tree)
-                , self.residual[right_indices]
-            )
+            grad_hist_right, hess_hist_right = self.compute_histograms(right_indices, self.feat_indices_tree)
            grad_hist_left = gradient_histogram - grad_hist_right
            hess_hist_left = hessian_histogram - hess_hist_right
 
@@ -428,6 +427,8 @@ class WarpGBM(BaseEstimator, RegressorMixin):
            return ((y_true - y_pred) ** 2).mean().item()
        elif self.eval_metric == "corr":
            return 1 - torch.corrcoef(torch.vstack([y_true, y_pred]))[0, 1].item()
+        elif self.eval_metric == "rmsle":
+            return rmsle_torch(y_true, y_pred).item()
        else:
            raise ValueError(f"Invalid eval_metric: {self.eval_metric}.")
 
@@ -467,13 +468,9 @@ class WarpGBM(BaseEstimator, RegressorMixin):
            self.residual = self.Y_gpu - self.gradients
 
            if self.colsample_bytree < 1.0:
-                self.feat_indices_tree = torch.randperm(
-                    self.num_features, device=self.device
-                )[:k]
+                self.feat_indices_tree = torch.randperm(self.num_features, device=self.device, dtype=torch.int32)[:k]
 
-            self.root_gradient_histogram, self.root_hessian_histogram = (
-                self.compute_histograms(self.bin_indices[:, self.feat_indices_tree], self.residual)
-            )
+            self.root_gradient_histogram, self.root_hessian_histogram = self.compute_histograms(self.root_node_indices, self.feat_indices_tree)
 
            tree = self.grow_tree(
                self.root_gradient_histogram,
@@ -491,14 +488,13 @@ class WarpGBM(BaseEstimator, RegressorMixin):
        print("Finished training forest.")
 
    def bin_data_with_existing_edges(self, X_np):
-
-        num_samples = X_tensor.size(0)
+        num_samples = X_np.shape[0]
        bin_indices = torch.zeros(
            (num_samples, self.num_features), dtype=torch.int8, device=self.device
        )
        with torch.no_grad():
            for f in range(self.num_features):
-                X_f =
+                X_f = torch.as_tensor(X_np[:, f], device=self.device, dtype=torch.float32).contiguous()
                bin_edges_f = self.bin_edges[f]
                bin_indices_f = bin_indices[:, f].contiguous()
                node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
@@ -537,9 +533,11 @@ class WarpGBM(BaseEstimator, RegressorMixin):
            is_prebinned = False
 
        if is_prebinned:
-            bin_indices = (
-
+            bin_indices = torch.empty(
+                X_np.shape, dtype=torch.int8, device="cuda"
            )
+            for f in range(self.num_features):
+                bin_indices[:, f] = torch.as_tensor(X_np[:, f], device=self.device).contiguous()
        else:
            bin_indices = self.bin_data_with_existing_edges(X_np)
        return bin_indices
warpgbm-0.1.27/warpgbm/cuda/histogram_kernel.cu

@@ -0,0 +1,95 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <torch/extension.h>
+
+__global__ void histogram_tiled_configurable_kernel(
+    const int8_t *__restrict__ bin_indices,      // [N, F]
+    const float *__restrict__ residuals,         // [N]
+    const int32_t *__restrict__ sample_indices,  // [N]
+    const int32_t *__restrict__ feature_indices, // [F]
+    float *__restrict__ grad_hist,               // [F * B]
+    float *__restrict__ hess_hist,               // [F * B]
+    int64_t N, int64_t F, int64_t B,
+    int rows_per_thread)
+{
+    int hist_feat_idx = blockIdx.x;
+    int feat = feature_indices[hist_feat_idx]; // 1 block per feature
+    int row_start = (blockIdx.y * blockDim.x + threadIdx.x) * rows_per_thread;
+
+    extern __shared__ float shmem[];
+    float *sh_grad = shmem;       // [B]
+    float *sh_hess = &sh_grad[B]; // [B]
+
+    // Initialize shared memory histograms
+    for (int b = threadIdx.x; b < B; b += blockDim.x)
+    {
+        sh_grad[b] = 0.0f;
+        sh_hess[b] = 0.0f;
+    }
+    __syncthreads();
+
+    // Each thread processes multiple rows
+    for (int r = 0; r < rows_per_thread; ++r)
+    {
+        int row = row_start + r;
+        if (row < N)
+        {
+            int sample = sample_indices[row];
+            int8_t bin = bin_indices[sample * F + feat];
+            if (bin >= 0 && bin < B)
+            {
+                atomicAdd(&sh_grad[bin], residuals[sample]);
+                atomicAdd(&sh_hess[bin], 1.0f);
+            }
+        }
+    }
+    __syncthreads();
+
+    // One thread per bin writes results back to global memory
+    for (int b = threadIdx.x; b < B; b += blockDim.x)
+    {
+        int64_t idx = hist_feat_idx * B + b;
+        atomicAdd(&grad_hist[idx], sh_grad[b]);
+        atomicAdd(&hess_hist[idx], sh_hess[b]);
+    }
+}
+
+void launch_histogram_kernel_cuda_configurable(
+    const at::Tensor &bin_indices,
+    const at::Tensor &residuals,
+    const at::Tensor &sample_indices,
+    const at::Tensor &feature_indices,
+    at::Tensor &grad_hist,
+    at::Tensor &hess_hist,
+    int num_bins,
+    int threads_per_block = 256,
+    int rows_per_thread = 1)
+{
+
+    int64_t N = sample_indices.size(0);
+    int64_t F = feature_indices.size(0);
+    int num_features_master = bin_indices.size(1);
+
+    int64_t rows_per_block = threads_per_block * rows_per_thread;
+    int64_t row_tiles = (N + rows_per_block - 1) / rows_per_block;
+
+    dim3 blocks(F, row_tiles); // grid.x = F, grid.y = row_tiles
+    dim3 threads(threads_per_block);
+    int shared_mem_bytes = 2 * num_bins * sizeof(float);
+
+    histogram_tiled_configurable_kernel<<<blocks, threads, shared_mem_bytes>>>(
+        bin_indices.data_ptr<int8_t>(),
+        residuals.data_ptr<float>(),
+        sample_indices.data_ptr<int32_t>(),
+        feature_indices.data_ptr<int32_t>(),
+        grad_hist.data_ptr<float>(),
+        hess_hist.data_ptr<float>(),
+        N, num_features_master, num_bins,
+        rows_per_thread);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
+    }
+}
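Note: the new kernel's contract differs from the 0.1.26 variants it replaces. Instead of receiving a pre-gathered [n_rows, n_feats] slice, it gathers through sample_indices and feature_indices on the fly and writes one num_bins-wide histogram row per selected feature. Below is a minimal pure-PyTorch sketch of that accumulation, not part of the package: histogram_reference is a hypothetical name for checking grad_hist/hess_hist outputs, and it assumes every bin value lies in [0, num_bins), which the CUDA kernel enforces with a bounds check.

import torch

def histogram_reference(bin_indices, residuals, sample_indices, feature_indices, num_bins):
    # bin_indices: [N, F] int8; residuals: [N] float32;
    # sample_indices: [n] int32 row subset; feature_indices: [f] int32 column subset.
    grad_hist = torch.zeros(len(feature_indices), num_bins)  # residual sums per bin
    hess_hist = torch.zeros(len(feature_indices), num_bins)  # sample counts per bin
    sub = bin_indices[sample_indices.long()][:, feature_indices.long()].long()  # [n, f]
    res = residuals[sample_indices.long()]
    for j in range(sub.shape[1]):
        grad_hist[j].scatter_add_(0, sub[:, j], res)
        hess_hist[j].scatter_add_(0, sub[:, j], torch.ones_like(res))
    return grad_hist, hess_hist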
{warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/node_kernel.cpp

@@ -2,23 +2,6 @@
 #include <vector>
 
 // Declare the function from histogram_kernel.cu
-void launch_histogram_kernel_cuda(
-    const at::Tensor &bin_indices,
-    const at::Tensor &gradients,
-    at::Tensor &grad_hist,
-    at::Tensor &hess_hist,
-    int num_bins,
-    int threads_per_block = 256,
-    int rows_per_thread = 1);
-
-void launch_histogram_kernel_cuda_2(
-    const at::Tensor &bin_indices, // int8 [N, F]
-    const at::Tensor &gradients,   // float32 [N]
-    at::Tensor &grad_hist,         // float32 [F * B]
-    at::Tensor &hess_hist,         // float32 [F * B]
-    int num_bins,
-    int threads_per_block = 256,
-    int rows_per_thread = 1);
 
 void launch_best_split_kernel_cuda(
     const at::Tensor &G, // [F x B]
@@ -32,7 +15,9 @@ void launch_best_split_kernel_cuda(
 
 void launch_histogram_kernel_cuda_configurable(
     const at::Tensor &bin_indices,
-    const at::Tensor &
+    const at::Tensor &residual,
+    const at::Tensor &sample_indices,
+    const at::Tensor &feature_indices,
     at::Tensor &grad_hist,
     at::Tensor &hess_hist,
     int num_bins,
@@ -54,8 +39,6 @@ void predict_with_forest(
 // Bindings
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
-    m.def("compute_histogram", &launch_histogram_kernel_cuda, "Histogram (CUDA)");
-    m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
     m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
     m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
     m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
warpgbm-0.1.27/warpgbm/metrics.py

@@ -0,0 +1,10 @@
+# warpgbm/metrics.py
+
+import torch
+
+def rmsle_torch(y_true, y_pred, eps=1e-7):
+    y_true = torch.clamp(y_true, min=0)
+    y_pred = torch.clamp(y_pred, min=0)
+    log_true = torch.log1p(y_true + eps)
+    log_pred = torch.log1p(y_pred + eps)
+    return torch.sqrt(torch.mean((log_true - log_pred) ** 2))
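Together with the core.py change above, rmsle_torch backs the new eval_metric="rmsle" option in fit. A minimal usage sketch on synthetic data; the parameter values are illustrative only, not recommendations:

import numpy as np
from warpgbm import WarpGBM

X = np.random.rand(10_000, 20).astype(np.float32)
y = np.random.rand(10_000).astype(np.float32)  # rmsle_torch clamps negatives to 0
era = np.zeros(10_000, dtype=np.int32)

model = WarpGBM(max_depth=4, num_bins=16, n_estimators=50)
model.fit(
    X, y,
    era_id=era,
    X_eval=X, y_eval=y,      # use held-out data in practice
    eval_every_n_trees=10,
    early_stopping_rounds=3,
    eval_metric="rmsle",     # new in 0.1.27; routed to rmsle_torch in core.py
)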
{warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/SOURCES.txt

@@ -5,10 +5,12 @@ pyproject.toml
 setup.py
 version.txt
 tests/__init__.py
+tests/full_numerai_test.py
 tests/numerai_test.py
 tests/test_fit_predict_corr.py
 warpgbm/__init__.py
 warpgbm/core.py
+warpgbm/metrics.py
 warpgbm.egg-info/PKG-INFO
 warpgbm.egg-info/SOURCES.txt
 warpgbm.egg-info/dependency_links.txt
warpgbm-0.1.26/tests/test_fit_predict_corr.py
DELETED

@@ -1,58 +0,0 @@
-import numpy as np
-from warpgbm import WarpGBM
-from sklearn.datasets import make_regression
-import time
-from sklearn.metrics import mean_squared_error
-
-
-def test_fit_predictpytee_correlation():
-    np.random.seed(42)
-    N = 100_000
-    F = 1000
-    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
-    era = np.zeros(N, dtype=np.int32)
-    corrs = []
-    mses = []
-
-    for hist_type in ["hist1", "hist2", "hist3"]:
-        print(f"\nTesting histogram method: {hist_type}")
-
-        model = WarpGBM(
-            max_depth=10,
-            num_bins=10,
-            n_estimators=100,
-            learning_rate=1,
-            verbosity=False,
-            histogram_computer=hist_type,
-            threads_per_block=64,
-            rows_per_thread=4,
-        )
-
-        start_fit = time.time()
-        model.fit(
-            X,
-            y,
-            era_id=era,
-            X_eval=X,
-            y_eval=y,
-            eval_every_n_trees=10,
-            early_stopping_rounds=1,
-            eval_metric="corr",
-        )
-        fit_time = time.time() - start_fit
-        print(f" Fit time: {fit_time:.3f} seconds")
-
-        start_pred = time.time()
-        preds = model.predict(X)
-        pred_time = time.time() - start_pred
-        print(f" Predict time: {pred_time:.3f} seconds")
-
-        corr = np.corrcoef(preds, y)[0, 1]
-        mse = mean_squared_error(preds, y)
-        print(f" Correlation: {corr:.4f}")
-        print(f" MSE: {mse:.4f}")
-        corrs.append(corr)
-        mses.append(mse)
-
-    assert (np.array(corrs) > 0.9).all(), f"In-sample correlation too low: {corrs}"
-    assert (np.array(mses) < 2).all(), f"In-sample mse too high: {mses}"
warpgbm-0.1.26/version.txt
DELETED

@@ -1 +0,0 @@
-0.1.26
warpgbm-0.1.26/warpgbm/cuda/histogram_kernel.cu
DELETED

@@ -1,250 +0,0 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <torch/extension.h>
-
-#define F_TILE 128 // Number of features processed per block (tile)
-
-// Each block processes a tile of features (of size up to F_TILE) and a chunk of samples.
-__global__ void histogram_kernel_shared_sample(
-    const int8_t *__restrict__ bin_indices, // [N, F] bin indices
-    const float *__restrict__ gradients,    // [N] gradient values
-    float *__restrict__ grad_hist,          // [F * B] global gradient histogram (flattened)
-    float *__restrict__ hess_hist,          // [F * B] global hessian histogram (flattened)
-    int64_t N, int64_t F, int64_t B)
-{
-    // Use dynamic shared memory to hold the histogram for a tile.
-    // Allocate 2 arrays: one for gradients and one for hessians.
-    extern __shared__ float shmem[];
-    float *shared_grad = shmem;                // size: tile_features * B floats
-    float *shared_hess = shmem + (F_TILE * B); // same size
-
-    int tid = threadIdx.x; // Use a 1D block (for sample processing)
-    int block_size = blockDim.x;
-
-    // Each block is assigned a tile of features:
-    int feature_offset = blockIdx.x * F_TILE;
-    // Adjust tile width if we're near the end of the feature dimension.
-    int tile_features = (feature_offset + F_TILE > F) ? (F - feature_offset) : F_TILE;
-    int tile_size = tile_features * B; // total number of bins in this feature tile
-
-    // Initialize the tile's shared memory histograms.
-    for (int i = tid; i < tile_size; i += block_size)
-    {
-        shared_grad[i] = 0.0f;
-        shared_hess[i] = 0.0f;
-    }
-    __syncthreads();
-
-    // Each block also covers a chunk of samples. Determine the sample index
-    int sample = blockIdx.y * block_size + tid;
-    if (sample < N)
-    {
-        // For each feature in this tile, compute the bin and update shared histograms.
-        for (int j = 0; j < tile_features; j++)
-        {
-            // Global feature index.
-            int f_idx = feature_offset + j;
-            int64_t idx = sample * F + f_idx; // index into the [N, F] bin_indices tensor
-            int8_t b = bin_indices[idx];      // get bin index
-            if (b >= 0 && b < B)
-            {
-                int shared_idx = j * B + b; // index into the tile histogram in shared memory
-                // Using atomics because several threads may update the same bin.
-                atomicAdd(&shared_grad[shared_idx], gradients[sample]);
-                atomicAdd(&shared_hess[shared_idx], 1.0f);
-            }
-        }
-    }
-    __syncthreads();
-
-    // Flush the per-tile histograms from shared memory to global memory.
-    // Each bin in the tile is added to the global histogram (which is sized [F, B]).
-    for (int i = tid; i < tile_size; i += block_size)
-    {
-        int local_feature = i / B; // feature index relative to the tile
-        int bin = i % B;           // bin index
-        int f_idx = feature_offset + local_feature;
-        if (f_idx < F)
-        {
-            int global_idx = f_idx * B + bin;
-            atomicAdd(&grad_hist[global_idx], shared_grad[i]);
-            atomicAdd(&hess_hist[global_idx], shared_hess[i]);
-        }
-    }
-}
-
-void launch_histogram_kernel_cuda(
-    const at::Tensor &bin_indices, // [N, F] int8 tensor
-    const at::Tensor &gradients,   // [N] float tensor
-    at::Tensor &grad_hist,         // [F * B] float tensor (preallocated)
-    at::Tensor &hess_hist,         // [F * B] float tensor (preallocated)
-    int num_bins,
-    int threads_per_block = 256,
-    int rows_per_thread = 1)
-{
-    int64_t N = bin_indices.size(0);
-    int64_t F = bin_indices.size(1);
-    int64_t B = num_bins;
-
-    // Define grid and block dimensions.
-    // blockDim.x: number of threads per block (for processing samples).
-    // gridDim.x: number of feature tiles.
-    int grid_x = (F + F_TILE - 1) / F_TILE;
-    // gridDim.y: number of sample chunks.
-    int grid_y = (N + threads_per_block - 1) / threads_per_block;
-    dim3 blocks(grid_x, grid_y);
-    dim3 threads(threads_per_block);
-
-    // Calculate shared memory size:
-    // We allocate 2 arrays of size (F_TILE * B) floats (one for grad and one for hess).
-    size_t shared_mem_size = 2 * F_TILE * B * sizeof(float);
-
-    histogram_kernel_shared_sample<<<blocks, threads, shared_mem_size>>>(
-        bin_indices.data_ptr<int8_t>(),
-        gradients.data_ptr<float>(),
-        grad_hist.data_ptr<float>(),
-        hess_hist.data_ptr<float>(),
-        N, F, B);
-}
-
-// CUDA kernel: tiled, 64-bit safe
-__global__ void histogram_tiled_kernel(
-    const int8_t *__restrict__ bin_indices, // [N, F]
-    const float *__restrict__ gradients,    // [N]
-    float *__restrict__ grad_hist,          // [F * B]
-    float *__restrict__ hess_hist,          // [F * B]
-    int64_t F, int64_t B, int64_t tile_size)
-{
-    int64_t feature_tiles = (F + tile_size - 1) / tile_size;
-    int64_t row = static_cast<int64_t>(blockIdx.x) / feature_tiles;
-    int64_t tile = static_cast<int64_t>(blockIdx.x) % feature_tiles;
-    int64_t feat = tile * tile_size + threadIdx.x;
-
-    if (feat >= F)
-        return;
-
-    int8_t bin = bin_indices[row * F + feat];
-    if (bin >= 0 && bin < B)
-    {
-        int64_t idx = feat * B + bin;
-        atomicAdd(&grad_hist[idx], gradients[row]);
-        atomicAdd(&hess_hist[idx], 1.0f);
-    }
-}
-
-// Host function exposed to PyTorch
-void launch_histogram_kernel_cuda_2(
-    const at::Tensor &bin_indices, // int8 [N, F]
-    const at::Tensor &gradients,   // float32 [N]
-    at::Tensor &grad_hist,         // float32 [F * B]
-    at::Tensor &hess_hist,         // float32 [F * B]
-    int num_bins,
-    int threads_per_block = 256,
-    int rows_per_thread = 1)
-{
-
-    int64_t N = bin_indices.size(0);
-    int64_t F = bin_indices.size(1);
-    int64_t tile_size = threads_per_block;
-    int64_t feature_tiles = (F + tile_size - 1) / tile_size;
-    int64_t total_blocks = N * feature_tiles;
-
-    histogram_tiled_kernel<<<
-        static_cast<int>(total_blocks),
-        static_cast<int>(tile_size)>>>(
-        bin_indices.data_ptr<int8_t>(),
-        gradients.data_ptr<float>(),
-        grad_hist.data_ptr<float>(),
-        hess_hist.data_ptr<float>(),
-        F, num_bins, tile_size);
-
-    // Optional: check for kernel launch failure
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess)
-    {
-        printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-    }
-}
-
-__global__ void histogram_tiled_configurable_kernel(
-    const int8_t *__restrict__ bin_indices, // [N, F]
-    const float *__restrict__ gradients,    // [N]
-    float *__restrict__ grad_hist,          // [F * B]
-    float *__restrict__ hess_hist,          // [F * B]
-    int64_t N, int64_t F, int64_t B,
-    int rows_per_thread)
-{
-    int feat = blockIdx.x; // 1 block per feature
-    int row_start = (blockIdx.y * blockDim.x + threadIdx.x) * rows_per_thread;
-
-    extern __shared__ float shmem[];
-    float *sh_grad = shmem;       // [B]
-    float *sh_hess = &sh_grad[B]; // [B]
-
-    // Initialize shared memory histograms
-    for (int b = threadIdx.x; b < B; b += blockDim.x)
-    {
-        sh_grad[b] = 0.0f;
-        sh_hess[b] = 0.0f;
-    }
-    __syncthreads();
-
-    // Each thread processes multiple rows
-    for (int r = 0; r < rows_per_thread; ++r)
-    {
-        int row = row_start + r;
-        if (row < N)
-        {
-            int8_t bin = bin_indices[row * F + feat];
-            if (bin >= 0 && bin < B)
-            {
-                atomicAdd(&sh_grad[bin], gradients[row]);
-                atomicAdd(&sh_hess[bin], 1.0f);
-            }
-        }
-    }
-    __syncthreads();
-
-    // One thread per bin writes results back to global memory
-    for (int b = threadIdx.x; b < B; b += blockDim.x)
-    {
-        int64_t idx = feat * B + b;
-        atomicAdd(&grad_hist[idx], sh_grad[b]);
-        atomicAdd(&hess_hist[idx], sh_hess[b]);
-    }
-}
-
-void launch_histogram_kernel_cuda_configurable(
-    const at::Tensor &bin_indices,
-    const at::Tensor &gradients,
-    at::Tensor &grad_hist,
-    at::Tensor &hess_hist,
-    int num_bins,
-    int threads_per_block = 256,
-    int rows_per_thread = 1)
-{
-
-    int64_t N = bin_indices.size(0);
-    int64_t F = bin_indices.size(1);
-
-    int rows_per_block = threads_per_block * rows_per_thread;
-    int row_tiles = (N + rows_per_block - 1) / rows_per_block;
-
-    dim3 blocks(F, row_tiles); // grid.x = F, grid.y = row_tiles
-    dim3 threads(threads_per_block);
-    int shared_mem_bytes = 2 * num_bins * sizeof(float);
-
-    histogram_tiled_configurable_kernel<<<blocks, threads, shared_mem_bytes>>>(
-        bin_indices.data_ptr<int8_t>(),
-        gradients.data_ptr<float>(),
-        grad_hist.data_ptr<float>(),
-        hess_hist.data_ptr<float>(),
-        N, F, num_bins,
-        rows_per_thread);
-
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess)
-    {
-        printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-    }
-}