warpgbm 0.1.26__tar.gz → 0.1.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {warpgbm-0.1.26/warpgbm.egg-info → warpgbm-0.1.27}/PKG-INFO +1 -1
  2. {warpgbm-0.1.26 → warpgbm-0.1.27}/pyproject.toml +2 -2
  3. warpgbm-0.1.27/tests/full_numerai_test.py +67 -0
  4. warpgbm-0.1.27/tests/test_fit_predict_corr.py +52 -0
  5. warpgbm-0.1.27/version.txt +1 -0
  6. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/core.py +78 -80
  7. warpgbm-0.1.27/warpgbm/cuda/histogram_kernel.cu +95 -0
  8. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/node_kernel.cpp +3 -20
  9. warpgbm-0.1.27/warpgbm/metrics.py +10 -0
  10. {warpgbm-0.1.26 → warpgbm-0.1.27/warpgbm.egg-info}/PKG-INFO +1 -1
  11. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/SOURCES.txt +2 -0
  12. warpgbm-0.1.26/tests/test_fit_predict_corr.py +0 -58
  13. warpgbm-0.1.26/version.txt +0 -1
  14. warpgbm-0.1.26/warpgbm/cuda/histogram_kernel.cu +0 -250
  15. {warpgbm-0.1.26 → warpgbm-0.1.27}/LICENSE +0 -0
  16. {warpgbm-0.1.26 → warpgbm-0.1.27}/MANIFEST.in +0 -0
  17. {warpgbm-0.1.26 → warpgbm-0.1.27}/README.md +0 -0
  18. {warpgbm-0.1.26 → warpgbm-0.1.27}/setup.cfg +0 -0
  19. {warpgbm-0.1.26 → warpgbm-0.1.27}/setup.py +0 -0
  20. {warpgbm-0.1.26 → warpgbm-0.1.27}/tests/__init__.py +0 -0
  21. {warpgbm-0.1.26 → warpgbm-0.1.27}/tests/numerai_test.py +0 -0
  22. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/__init__.py +0 -0
  23. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/__init__.py +0 -0
  24. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/best_split_kernel.cu +0 -0
  25. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/binner.cu +0 -0
  26. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm/cuda/predict.cu +0 -0
  27. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/dependency_links.txt +0 -0
  28. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/requires.txt +0 -0
  29. {warpgbm-0.1.26 → warpgbm-0.1.27}/warpgbm.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warpgbm
- Version: 0.1.26
+ Version: 0.1.27
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "warpgbm"
- version = "0.1.26"
+ version = "0.1.27"
  description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -13,5 +13,5 @@ dependencies = [
  "torch",
  "numpy",
  "tqdm",
- "scikit-learn"
+ "scikit-learn"
  ]
@@ -0,0 +1,67 @@
+ from numerapi import NumerAPI
+ import pandas as pd
+ import numpy as np
+ from warpgbm import WarpGBM
+ import time
+ from sklearn.metrics import mean_squared_error
+
+
+ def predict_in_chunks(model, X, chunk_size=100_000):
+     preds = []
+     for i in range(0, X.shape[0], chunk_size):
+         X_chunk = X[i : i + chunk_size]
+         preds.append(model.predict(X_chunk))
+     return np.concatenate(preds)
+
+
+ def test_numerai_data():
+     napi = NumerAPI()
+     napi.download_dataset("v5.0/train.parquet", "numerai_train.parquet")
+     napi.download_dataset("v5.0/validation.parquet", "numerai_validation.parquet")
+
+     data = pd.concat([
+         pd.read_parquet("numerai_train.parquet"),
+         pd.read_parquet("numerai_validation.parquet")
+     ])
+     features = [f for f in list(data) if "feature" in f]
+     target = "target"
+     data = data.loc[data[target].isna() == False]
+
+     X = data[features].astype("int8").values[:]
+     y = data[target].values
+
+     model = WarpGBM(
+         max_depth=3,
+         num_bins=5,
+         n_estimators=10,
+         learning_rate=1,
+         threads_per_block=64,
+         rows_per_thread=4,
+         colsample_bytree=0.8,
+     )
+
+     start_fit = time.time()
+     model.fit(
+         X,
+         y,
+         # era_id=era,
+         # X_eval=X,
+         # y_eval=y,
+         # eval_every_n_trees=10,
+         # early_stopping_rounds=1,
+     )
+     fit_time = time.time() - start_fit
+     print(f" Fit time: {fit_time:.3f} seconds")
+
+     start_pred = time.time()
+     preds = predict_in_chunks(model, X, chunk_size=500_000)
+     pred_time = time.time() - start_pred
+     print(f" Predict time: {pred_time:.3f} seconds")
+
+     corr = np.corrcoef(preds, y)[0, 1]
+     mse = mean_squared_error(preds, y)
+     print(f" Correlation: {corr:.4f}")
+     print(f" MSE: {mse:.4f}")
+
+     # assert corr > 0.68, f"In-sample correlation too low: {corr}"
+     # assert mse < 0.03, f"In-sample mse too high: {mse}"
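The chunked-prediction helper above exists to bound peak GPU memory: rows are scored in slices and stitched back together on the host. A self-contained sketch of the same pattern (the stub model is hypothetical, standing in for a fitted WarpGBM, so the chunking logic runs without a GPU):

    import numpy as np

    def predict_in_chunks(model, X, chunk_size=100_000):
        # Mirrors the test helper: score X slice by slice, then concatenate.
        preds = []
        for i in range(0, X.shape[0], chunk_size):
            preds.append(model.predict(X[i : i + chunk_size]))
        return np.concatenate(preds)

    class StubModel:
        # Hypothetical stand-in for a fitted WarpGBM regressor.
        def predict(self, X):
            return X.mean(axis=1).astype(np.float32)

    X = np.random.randint(0, 5, size=(1_200_000, 32)).astype(np.int8)
    preds = predict_in_chunks(StubModel(), X, chunk_size=500_000)
    assert preds.shape == (1_200_000,)  # 3 chunks: 500k + 500k + 200k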
@@ -0,0 +1,52 @@
+ import numpy as np
+ from warpgbm import WarpGBM
+ from sklearn.datasets import make_regression
+ import time
+ from sklearn.metrics import mean_squared_error
+
+
+ def test_fit_predict_correlation():
+     np.random.seed(42)
+     N = 100_000
+     F = 1000
+     X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
+     era = np.zeros(N, dtype=np.int32)
+
+     model = WarpGBM(
+         max_depth=10,
+         num_bins=10,
+         n_estimators=100,
+         learning_rate=1,
+         threads_per_block=64,
+         rows_per_thread=4,
+         colsample_bytree=1.0,
+     )
+
+     start_fit = time.time()
+     model.fit(
+         X,
+         y,
+         era_id=era,
+         X_eval=X,
+         y_eval=y,
+         eval_every_n_trees=10,
+         early_stopping_rounds=1,
+         eval_metric="corr",
+     )
+     fit_time = time.time() - start_fit
+     print(f" Fit time: {fit_time:.3f} seconds")
+
+     start_pred = time.time()
+     preds = model.predict(X)
+     pred_time = time.time() - start_pred
+     print(f" Predict time: {pred_time:.3f} seconds")
+
+     corr = np.corrcoef(preds, y)[0, 1]
+     mse = mean_squared_error(preds, y)
+     print(f" Correlation: {corr:.4f}")
+     print(f" MSE: {mse:.4f}")
+
+     assert corr > 0.9, f"In-sample correlation too low: {corr}"
+     assert mse < 2, f"In-sample mse too high: {mse}"
@@ -0,0 +1 @@
+ 0.1.27
@@ -1,19 +1,14 @@
  import torch
  import numpy as np
  from sklearn.base import BaseEstimator, RegressorMixin
+ from sklearn.metrics import mean_squared_log_error
  from warpgbm.cuda import node_kernel
+ from warpgbm.metrics import rmsle_torch
  from tqdm import tqdm
  from typing import Tuple
  from torch import Tensor
  import gc

- histogram_kernels = {
-     "hist1": node_kernel.compute_histogram,
-     "hist2": node_kernel.compute_histogram2,
-     "hist3": node_kernel.compute_histogram3,
- }
-
-
  class WarpGBM(BaseEstimator, RegressorMixin):
      def __init__(
          self,
@@ -23,8 +18,6 @@ class WarpGBM(BaseEstimator, RegressorMixin):
          n_estimators=100,
          min_child_weight=20,
          min_split_gain=0.0,
-         verbosity=True,
-         histogram_computer="hist3",
          threads_per_block=64,
          rows_per_thread=4,
          L2_reg=1e-6,
@@ -40,7 +33,6 @@
              n_estimators=n_estimators,
              min_child_weight=min_child_weight,
              min_split_gain=min_split_gain,
-             histogram_computer=histogram_computer,
              threads_per_block=threads_per_block,
              rows_per_thread=rows_per_thread,
              L2_reg=L2_reg,
@@ -68,7 +60,6 @@
          self.min_child_weight = min_child_weight
          self.min_split_gain = min_split_gain
          self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
-         self.compute_histogram = histogram_kernels[histogram_computer]
          self.threads_per_block = threads_per_block
          self.rows_per_thread = rows_per_thread
          self.L2_reg = L2_reg
@@ -128,10 +119,6 @@
          )
          if kwargs["L2_reg"] < 0 or kwargs["L1_reg"] < 0:
              raise ValueError("L2_reg and L1_reg must be non-negative.")
-         if kwargs["histogram_computer"] not in histogram_kernels:
-             raise ValueError(
-                 f"Invalid histogram_computer: {kwargs['histogram_computer']}. Choose from {list(histogram_kernels.keys())}."
-             )
          if kwargs["colsample_bytree"] <= 0 or kwargs["colsample_bytree"] > 1:
              raise ValueError(
                  f"Invalid colsample_bytree: {kwargs['colsample_bytree']}. Must be a float value > 0 and <= 1."
@@ -206,9 +193,9 @@
          # No early stopping = set to "never trigger"
          early_stopping_rounds = self.n_estimators + 1

-         if eval_metric not in ["mse", "corr"]:
+         if eval_metric not in ["mse", "corr", "rmsle"]:
              raise ValueError(
-                 f"Invalid eval_metric: {eval_metric}. Choose 'mse' or 'corr'."
+                 f"Invalid eval_metric: {eval_metric}. Choose 'mse', 'corr', or 'rmsle'."
              )

          return early_stopping_rounds  # May have been defaulted here
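With this guard, "rmsle" joins "mse" and "corr" as an accepted eval metric. A hedged sketch of opting into it on synthetic data (keyword names follow the package's own tests; hyperparameter values here are illustrative):

    import numpy as np
    from warpgbm import WarpGBM

    X = np.random.rand(10_000, 20).astype(np.float32)
    y = np.random.rand(10_000).astype(np.float32)  # non-negative targets, as RMSLE expects
    era = np.zeros(10_000, dtype=np.int32)

    model = WarpGBM(max_depth=4, num_bins=16, n_estimators=50)
    model.fit(X, y, era_id=era, X_eval=X, y_eval=y,
              eval_every_n_trees=10, early_stopping_rounds=2,
              eval_metric="rmsle")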
@@ -237,14 +224,16 @@
          )
          self.num_samples, self.num_features = X.shape
          self.gradients = torch.zeros_like(self.Y_gpu)
-         self.root_node_indices = torch.arange(self.num_samples, device=self.device)
+         self.root_node_indices = torch.arange(self.num_samples, device=self.device, dtype=torch.int32)
          self.base_prediction = self.Y_gpu.mean().item()
          self.gradients += self.base_prediction
-         self.best_gains = torch.zeros(self.num_features, device=self.device)
-         self.best_bins = torch.zeros(
-             self.num_features, device=self.device, dtype=torch.int32
-         )
-         self.feature_indices = torch.arange(self.num_features, device=self.device)
+         if self.colsample_bytree < 1.0:
+             k = max(1, int(self.colsample_bytree * self.num_features))
+         else:
+             k = self.num_features
+         self.best_gains = torch.zeros(k, device=self.device)
+         self.best_bins = torch.zeros(k, device=self.device, dtype=torch.int32)
+         self.feature_indices = torch.arange(self.num_features, device=self.device, dtype=torch.int32)

          # ─── Optional Eval Set ───
          if X_eval is not None and y_eval is not None:
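The split-search buffers are now sized to the per-tree feature sample k rather than to the full feature count, matching the (len(feature_indices), num_bins) histograms allocated later. The sizing rule, worked through:

    # Worked example of the rule above:
    num_features = 1_000
    colsample_bytree = 0.8
    k = max(1, int(colsample_bytree * num_features))
    print(k)  # 800 slots in best_gains/best_bins, one per sampled feature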
@@ -273,50 +262,47 @@
      def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
          with torch.no_grad():
              self.num_samples, self.num_features = X_np.shape
+
              Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
-             era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
-             is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-             if is_integer_type:
-                 max_vals = X_np.max(axis=0)
-                 if np.all(max_vals < self.num_bins):
-                     print(
-                         "Detected pre-binned integer input — skipping quantile binning."
-                     )
-                     bin_indices = (
-                         torch.from_numpy(X_np)
-                         .to(self.device)
-                         .contiguous()
-                         .to(torch.int8)
-                     )

-                     # We'll store None or an empty tensor in self.bin_edges
-                     # to indicate that we skip binning at predict-time
-                     bin_edges = torch.arange(
-                         1, self.num_bins, dtype=torch.float32
-                     ).repeat(self.num_features, 1)
-                     bin_edges = bin_edges.to(self.device)
-                     unique_eras, era_indices = torch.unique(
-                         era_id_gpu, return_inverse=True
-                     )
-                     return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
-                 else:
-                     print(
-                         "Integer input detected, but values exceed num_bins — falling back to quantile binning."
-                     )
+             era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)

              bin_indices = torch.empty(
                  (self.num_samples, self.num_features), dtype=torch.int8, device="cuda"
              )
+
+             is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+             max_vals = X_np.max(axis=0)
+
+             if is_integer_type and np.all(max_vals < self.num_bins):
+                 print(
+                     "Detected pre-binned integer input — skipping quantile binning."
+                 )
+                 for f in range(self.num_features):
+                     bin_indices[:, f] = torch.as_tensor(X_np[:, f], device=self.device).contiguous()
+                 # bin_indices = X_np.to("cuda", non_blocking=True).contiguous()
+
+                 # We'll store None or an empty tensor in self.bin_edges
+                 # to indicate that we skip binning at predict-time
+                 bin_edges = torch.arange(
+                     1, self.num_bins, dtype=torch.float32
+                 ).repeat(self.num_features, 1)
+                 bin_edges = bin_edges.to(self.device)
+                 unique_eras, era_indices = torch.unique(
+                     era_id_gpu, return_inverse=True
+                 )
+                 return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+
+             print("quantile binning.")
+
              bin_edges = torch.empty(
                  (self.num_features, self.num_bins - 1),
                  dtype=torch.float32,
                  device="cuda",
              )

-             X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
-
              for f in range(self.num_features):
-                 X_f = X_np[:, f].to("cuda", non_blocking=True)
+                 X_f = torch.as_tensor(X_np[:, f], device=self.device, dtype=torch.float32).contiguous()
                  quantiles = torch.linspace(
                      0, 1, self.num_bins + 1, device="cuda", dtype=X_f.dtype
                  )[1:-1]
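The refactor flattens the old nested branches into a single guard: the fast path fires only for integer inputs whose per-column maxima all sit below num_bins; everything else falls through to quantile binning. The decision rule in isolation:

    import numpy as np

    num_bins = 5
    X_prebinned = np.array([[0, 4], [3, 1]], dtype=np.int8)  # column maxima 3, 4 < 5
    X_overflow = np.array([[0, 9], [3, 1]], dtype=np.int8)   # 9 >= 5 forces quantile binning

    for X in (X_prebinned, X_overflow):
        fast_path = np.issubdtype(X.dtype, np.integer) and np.all(X.max(axis=0) < num_bins)
        print(fast_path)  # True, then False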
@@ -331,17 +317,19 @@
              unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
              return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu

-     def compute_histograms(self, bin_indices_sub, gradients):
+     def compute_histograms(self, sample_indices, feature_indices):
          grad_hist = torch.zeros(
-             (self.num_features, self.num_bins), device=self.device, dtype=torch.float32
+             (len(feature_indices), self.num_bins), device=self.device, dtype=torch.float32
          )
          hess_hist = torch.zeros(
-             (self.num_features, self.num_bins), device=self.device, dtype=torch.float32
+             (len(feature_indices), self.num_bins), device=self.device, dtype=torch.float32
          )

-         self.compute_histogram(
-             bin_indices_sub,
-             gradients,
+         node_kernel.compute_histogram3(
+             self.bin_indices,
+             self.residual,
+             sample_indices,
+             feature_indices,
              grad_hist,
              hess_hist,
              self.num_bins,
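The helper now hands the kernel index tensors instead of gathered copies, eliminating the index_select materializations the old signature required. A NumPy reference of what the kernel accumulates (my own sketch, not the CUDA source); with squared-error loss the hessian is constant, hence the plain count in hess:

    import numpy as np

    def histogram_reference(bin_indices, residual, sample_idx, feature_idx, num_bins):
        # grad[j, b]: sum of residuals of samples whose bin for feature feature_idx[j] is b
        # hess[j, b]: how many of those samples landed in bin b
        grad = np.zeros((len(feature_idx), num_bins), dtype=np.float32)
        hess = np.zeros((len(feature_idx), num_bins), dtype=np.float32)
        for j, f in enumerate(feature_idx):
            for s in sample_idx:
                b = bin_indices[s, f]
                grad[j, b] += residual[s]
                hess[j, b] += 1.0
        return grad, hess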
@@ -364,6 +352,9 @@

          if torch.all(self.best_bins == -1):
              return -1, -1  # No valid split found
+
+         # print(self.best_bins)
+         # print(self.best_gains)

          f = torch.argmax(self.best_gains).item()
          b = self.best_bins[f].item()
@@ -381,30 +372,38 @@
              gradient_histogram, hessian_histogram
          )

+         # print(local_feature, best_bin)
+
          if local_feature == -1:
              leaf_value = self.residual[node_indices].mean()
              self.gradients[node_indices] += self.learning_rate * leaf_value
              return {"leaf_value": leaf_value.item(), "samples": parent_size}
-
+
+         # print("DEBUG SHAPES -> bin_indices:", self.bin_indices.shape,
+         #       "| node_indices max:", node_indices.max().item(),
+         #       "| local_feature:", local_feature,
+         #       "| feat_indices_tree len:", len(self.feat_indices_tree),
+         #       "| feat index:", self.feat_indices_tree[local_feature])
+
          split_mask = self.bin_indices[node_indices, self.feat_indices_tree[local_feature]] <= best_bin
          left_indices = node_indices[split_mask]
          right_indices = node_indices[~split_mask]

+         # print("DEBUG SHAPES -> left_indices:", left_indices.shape,
+         #       "| right_indices:", right_indices.shape,
+         #       "| parent_size:", parent_size,
+         #       "| local_feature:", local_feature,
+         #       "| best_bin:", best_bin)
+
          left_size = left_indices.numel()
          right_size = right_indices.numel()

          if left_size <= right_size:
-             grad_hist_left, hess_hist_left = self.compute_histograms(
-                 self.bin_indices.index_select(0, left_indices).index_select(1, self.feat_indices_tree),
-                 self.residual[left_indices]
-             )
+             grad_hist_left, hess_hist_left = self.compute_histograms(left_indices, self.feat_indices_tree)
              grad_hist_right = gradient_histogram - grad_hist_left
              hess_hist_right = hessian_histogram - hess_hist_left
          else:
-             grad_hist_right, hess_hist_right = self.compute_histograms(
-                 self.bin_indices.index_select(0, right_indices).index_select(1, self.feat_indices_tree),
-                 self.residual[right_indices]
-             )
+             grad_hist_right, hess_hist_right = self.compute_histograms(right_indices, self.feat_indices_tree)
              grad_hist_left = gradient_histogram - grad_hist_right
              hess_hist_left = hessian_histogram - hess_hist_right
@@ -428,6 +427,8 @@
              return ((y_true - y_pred) ** 2).mean().item()
          elif self.eval_metric == "corr":
              return 1 - torch.corrcoef(torch.vstack([y_true, y_pred]))[0, 1].item()
+         elif self.eval_metric == "rmsle":
+             return rmsle_torch(y_true, y_pred).item()
          else:
              raise ValueError(f"Invalid eval_metric: {self.eval_metric}.")

@@ -467,13 +468,9 @@
              self.residual = self.Y_gpu - self.gradients

              if self.colsample_bytree < 1.0:
-                 self.feat_indices_tree = torch.randperm(
-                     self.num_features, device=self.device
-                 )[:k]
+                 self.feat_indices_tree = torch.randperm(self.num_features, device=self.device, dtype=torch.int32)[:k]

-             self.root_gradient_histogram, self.root_hessian_histogram = (
-                 self.compute_histograms(self.bin_indices[:, self.feat_indices_tree], self.residual)
-             )
+             self.root_gradient_histogram, self.root_hessian_histogram = self.compute_histograms(self.root_node_indices, self.feat_indices_tree)

              tree = self.grow_tree(
                  self.root_gradient_histogram,
@@ -491,14 +488,13 @@
          print("Finished training forest.")

      def bin_data_with_existing_edges(self, X_np):
-         X_tensor = torch.from_numpy(X_np).type(torch.float32).pin_memory()
-         num_samples = X_tensor.size(0)
+         num_samples = X_np.shape[0]
          bin_indices = torch.zeros(
              (num_samples, self.num_features), dtype=torch.int8, device=self.device
          )
          with torch.no_grad():
              for f in range(self.num_features):
-                 X_f = X_tensor[:, f].to(self.device, non_blocking=True)
+                 X_f = torch.as_tensor(X_np[:, f], device=self.device, dtype=torch.float32).contiguous()
                  bin_edges_f = self.bin_edges[f]
                  bin_indices_f = bin_indices[:, f].contiguous()
                  node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
@@ -537,9 +533,11 @@
              is_prebinned = False

          if is_prebinned:
-             bin_indices = (
-                 torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+             bin_indices = torch.empty(
+                 X_np.shape, dtype=torch.int8, device="cuda"
              )
+             for f in range(self.num_features):
+                 bin_indices[:, f] = torch.as_tensor(X_np[:, f], device=self.device).contiguous()
          else:
              bin_indices = self.bin_data_with_existing_edges(X_np)
          return bin_indices
@@ -0,0 +1,95 @@
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #include <torch/extension.h>
+
+ __global__ void histogram_tiled_configurable_kernel(
+     const int8_t *__restrict__ bin_indices,      // [N, F]
+     const float *__restrict__ residuals,         // [N]
+     const int32_t *__restrict__ sample_indices,  // [N]
+     const int32_t *__restrict__ feature_indices, // [F]
+     float *__restrict__ grad_hist,               // [F * B]
+     float *__restrict__ hess_hist,               // [F * B]
+     int64_t N, int64_t F, int64_t B,
+     int rows_per_thread)
+ {
+     int hist_feat_idx = blockIdx.x;
+     int feat = feature_indices[hist_feat_idx]; // 1 block per feature
+     int row_start = (blockIdx.y * blockDim.x + threadIdx.x) * rows_per_thread;
+
+     extern __shared__ float shmem[];
+     float *sh_grad = shmem;       // [B]
+     float *sh_hess = &sh_grad[B]; // [B]
+
+     // Initialize shared memory histograms
+     for (int b = threadIdx.x; b < B; b += blockDim.x)
+     {
+         sh_grad[b] = 0.0f;
+         sh_hess[b] = 0.0f;
+     }
+     __syncthreads();
+
+     // Each thread processes multiple rows
+     for (int r = 0; r < rows_per_thread; ++r)
+     {
+         int row = row_start + r;
+         if (row < N)
+         {
+             int sample = sample_indices[row];
+             int8_t bin = bin_indices[sample * F + feat];
+             if (bin >= 0 && bin < B)
+             {
+                 atomicAdd(&sh_grad[bin], residuals[sample]);
+                 atomicAdd(&sh_hess[bin], 1.0f);
+             }
+         }
+     }
+     __syncthreads();
+
+     // One thread per bin writes results back to global memory
+     for (int b = threadIdx.x; b < B; b += blockDim.x)
+     {
+         int64_t idx = hist_feat_idx * B + b;
+         atomicAdd(&grad_hist[idx], sh_grad[b]);
+         atomicAdd(&hess_hist[idx], sh_hess[b]);
+     }
+ }
+
+ void launch_histogram_kernel_cuda_configurable(
+     const at::Tensor &bin_indices,
+     const at::Tensor &residuals,
+     const at::Tensor &sample_indices,
+     const at::Tensor &feature_indices,
+     at::Tensor &grad_hist,
+     at::Tensor &hess_hist,
+     int num_bins,
+     int threads_per_block = 256,
+     int rows_per_thread = 1)
+ {
+     int64_t N = sample_indices.size(0);
+     int64_t F = feature_indices.size(0);
+     int num_features_master = bin_indices.size(1);
+
+     int64_t rows_per_block = threads_per_block * rows_per_thread;
+     int64_t row_tiles = (N + rows_per_block - 1) / rows_per_block;
+
+     dim3 blocks(F, row_tiles); // grid.x = F, grid.y = row_tiles
+     dim3 threads(threads_per_block);
+     int shared_mem_bytes = 2 * num_bins * sizeof(float);
+
+     histogram_tiled_configurable_kernel<<<blocks, threads, shared_mem_bytes>>>(
+         bin_indices.data_ptr<int8_t>(),
+         residuals.data_ptr<float>(),
+         sample_indices.data_ptr<int32_t>(),
+         feature_indices.data_ptr<int32_t>(),
+         grad_hist.data_ptr<float>(),
+         hess_hist.data_ptr<float>(),
+         N, num_features_master, num_bins,
+         rows_per_thread);
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess)
+     {
+         printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
+     }
+ }
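Launch geometry for the kernel above: grid.x walks the sampled features (blockIdx.x picks the histogram row and feature_indices maps it back to the master column), grid.y tiles the gathered rows, and shared memory holds one float per bin for gradients plus one for hessians. Illustrative arithmetic with made-up sizes:

    # Hypothetical sizes, mirroring the host-side math above:
    N = 1_000_000             # len(sample_indices)
    F = 800                   # len(feature_indices) -> grid.x
    num_bins = 32
    threads_per_block = 64
    rows_per_thread = 4

    rows_per_block = threads_per_block * rows_per_thread  # 256 rows per block
    row_tiles = -(-N // rows_per_block)                   # ceil division -> grid.y = 3907
    shared_mem_bytes = 2 * num_bins * 4                   # float32 grad + hess = 256 bytes
    print((F, row_tiles), shared_mem_bytes)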
@@ -2,23 +2,6 @@
  #include <vector>

  // Declare the function from histogram_kernel.cu
- void launch_histogram_kernel_cuda(
-     const at::Tensor &bin_indices,
-     const at::Tensor &gradients,
-     at::Tensor &grad_hist,
-     at::Tensor &hess_hist,
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1);
-
- void launch_histogram_kernel_cuda_2(
-     const at::Tensor &bin_indices, // int8 [N, F]
-     const at::Tensor &gradients,   // float32 [N]
-     at::Tensor &grad_hist,         // float32 [F * B]
-     at::Tensor &hess_hist,         // float32 [F * B]
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1);

  void launch_best_split_kernel_cuda(
      const at::Tensor &G, // [F x B]
@@ -32,7 +15,9 @@ void launch_best_split_kernel_cuda(

  void launch_histogram_kernel_cuda_configurable(
      const at::Tensor &bin_indices,
-     const at::Tensor &gradients,
+     const at::Tensor &residual,
+     const at::Tensor &sample_indices,
+     const at::Tensor &feature_indices,
      at::Tensor &grad_hist,
      at::Tensor &hess_hist,
      int num_bins,
@@ -54,8 +39,6 @@ void predict_with_forest(
  // Bindings
  PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
  {
-     m.def("compute_histogram", &launch_histogram_kernel_cuda, "Histogram (CUDA)");
-     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
      m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
      m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
      m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
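With the legacy kernels unregistered, compute_histogram3 is the only histogram entry point. A hedged sketch of calling it directly from Python, mirroring compute_histograms in core.py (requires the compiled CUDA extension; the trailing integers correspond to the binding's num_bins, threads_per_block, and rows_per_thread parameters):

    import torch
    from warpgbm.cuda import node_kernel

    N, F, B = 1_000, 8, 10
    bin_indices = torch.randint(0, B, (N, F), dtype=torch.int8, device="cuda")
    residual = torch.randn(N, device="cuda")
    sample_indices = torch.arange(N, device="cuda", dtype=torch.int32)
    feature_indices = torch.arange(F, device="cuda", dtype=torch.int32)
    grad_hist = torch.zeros((F, B), device="cuda")
    hess_hist = torch.zeros((F, B), device="cuda")

    node_kernel.compute_histogram3(bin_indices, residual, sample_indices,
                                   feature_indices, grad_hist, hess_hist,
                                   B, 64, 4)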
@@ -0,0 +1,10 @@
+ # warpgbm/metrics.py
+
+ import torch
+
+ def rmsle_torch(y_true, y_pred, eps=1e-7):
+     y_true = torch.clamp(y_true, min=0)
+     y_pred = torch.clamp(y_pred, min=0)
+     log_true = torch.log1p(y_true + eps)
+     log_pred = torch.log1p(y_pred + eps)
+     return torch.sqrt(torch.mean((log_true - log_pred) ** 2))
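RMSLE penalizes relative rather than absolute error, so a uniform multiplicative miss yields a roughly constant penalty regardless of scale; the clamp keeps log1p defined if predictions drift negative. A quick check:

    import torch
    from warpgbm.metrics import rmsle_torch

    y_true = torch.tensor([1.0, 10.0, 100.0])
    y_pred = 2 * y_true                 # every prediction 2x too high
    print(rmsle_torch(y_true, y_pred))  # ~0.59, approaching log(2) as targets grow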
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warpgbm
- Version: 0.1.26
+ Version: 0.1.27
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -5,10 +5,12 @@ pyproject.toml
  setup.py
  version.txt
  tests/__init__.py
+ tests/full_numerai_test.py
  tests/numerai_test.py
  tests/test_fit_predict_corr.py
  warpgbm/__init__.py
  warpgbm/core.py
+ warpgbm/metrics.py
  warpgbm.egg-info/PKG-INFO
  warpgbm.egg-info/SOURCES.txt
  warpgbm.egg-info/dependency_links.txt
@@ -1,58 +0,0 @@
- import numpy as np
- from warpgbm import WarpGBM
- from sklearn.datasets import make_regression
- import time
- from sklearn.metrics import mean_squared_error
-
-
- def test_fit_predictpytee_correlation():
-     np.random.seed(42)
-     N = 100_000
-     F = 1000
-     X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
-     era = np.zeros(N, dtype=np.int32)
-     corrs = []
-     mses = []
-
-     for hist_type in ["hist1", "hist2", "hist3"]:
-         print(f"\nTesting histogram method: {hist_type}")
-
-         model = WarpGBM(
-             max_depth=10,
-             num_bins=10,
-             n_estimators=100,
-             learning_rate=1,
-             verbosity=False,
-             histogram_computer=hist_type,
-             threads_per_block=64,
-             rows_per_thread=4,
-         )
-
-         start_fit = time.time()
-         model.fit(
-             X,
-             y,
-             era_id=era,
-             X_eval=X,
-             y_eval=y,
-             eval_every_n_trees=10,
-             early_stopping_rounds=1,
-             eval_metric="corr",
-         )
-         fit_time = time.time() - start_fit
-         print(f" Fit time: {fit_time:.3f} seconds")
-
-         start_pred = time.time()
-         preds = model.predict(X)
-         pred_time = time.time() - start_pred
-         print(f" Predict time: {pred_time:.3f} seconds")
-
-         corr = np.corrcoef(preds, y)[0, 1]
-         mse = mean_squared_error(preds, y)
-         print(f" Correlation: {corr:.4f}")
-         print(f" MSE: {mse:.4f}")
-         corrs.append(corr)
-         mses.append(mse)
-
-     assert (np.array(corrs) > 0.9).all(), f"In-sample correlation too low: {corrs}"
-     assert (np.array(mses) < 2).all(), f"In-sample mse too high: {mses}"
@@ -1 +0,0 @@
- 0.1.26
@@ -1,250 +0,0 @@
- #include <cuda.h>
- #include <cuda_runtime.h>
- #include <torch/extension.h>
-
- #define F_TILE 128 // Number of features processed per block (tile)
-
- // Each block processes a tile of features (of size up to F_TILE) and a chunk of samples.
- __global__ void histogram_kernel_shared_sample(
-     const int8_t *__restrict__ bin_indices, // [N, F] bin indices
-     const float *__restrict__ gradients,    // [N] gradient values
-     float *__restrict__ grad_hist,          // [F * B] global gradient histogram (flattened)
-     float *__restrict__ hess_hist,          // [F * B] global hessian histogram (flattened)
-     int64_t N, int64_t F, int64_t B)
- {
-     // Use dynamic shared memory to hold the histogram for a tile.
-     // Allocate 2 arrays: one for gradients and one for hessians.
-     extern __shared__ float shmem[];
-     float *shared_grad = shmem;                // size: tile_features * B floats
-     float *shared_hess = shmem + (F_TILE * B); // same size
-
-     int tid = threadIdx.x; // Use a 1D block (for sample processing)
-     int block_size = blockDim.x;
-
-     // Each block is assigned a tile of features:
-     int feature_offset = blockIdx.x * F_TILE;
-     // Adjust tile width if we're near the end of the feature dimension.
-     int tile_features = (feature_offset + F_TILE > F) ? (F - feature_offset) : F_TILE;
-     int tile_size = tile_features * B; // total number of bins in this feature tile
-
-     // Initialize the tile's shared memory histograms.
-     for (int i = tid; i < tile_size; i += block_size)
-     {
-         shared_grad[i] = 0.0f;
-         shared_hess[i] = 0.0f;
-     }
-     __syncthreads();
-
-     // Each block also covers a chunk of samples. Determine the sample index
-     int sample = blockIdx.y * block_size + tid;
-     if (sample < N)
-     {
-         // For each feature in this tile, compute the bin and update shared histograms.
-         for (int j = 0; j < tile_features; j++)
-         {
-             // Global feature index.
-             int f_idx = feature_offset + j;
-             int64_t idx = sample * F + f_idx; // index into the [N, F] bin_indices tensor
-             int8_t b = bin_indices[idx];      // get bin index
-             if (b >= 0 && b < B)
-             {
-                 int shared_idx = j * B + b; // index into the tile histogram in shared memory
-                 // Using atomics because several threads may update the same bin.
-                 atomicAdd(&shared_grad[shared_idx], gradients[sample]);
-                 atomicAdd(&shared_hess[shared_idx], 1.0f);
-             }
-         }
-     }
-     __syncthreads();
-
-     // Flush the per-tile histograms from shared memory to global memory.
-     // Each bin in the tile is added to the global histogram (which is sized [F, B]).
-     for (int i = tid; i < tile_size; i += block_size)
-     {
-         int local_feature = i / B; // feature index relative to the tile
-         int bin = i % B;           // bin index
-         int f_idx = feature_offset + local_feature;
-         if (f_idx < F)
-         {
-             int global_idx = f_idx * B + bin;
-             atomicAdd(&grad_hist[global_idx], shared_grad[i]);
-             atomicAdd(&hess_hist[global_idx], shared_hess[i]);
-         }
-     }
- }
-
- void launch_histogram_kernel_cuda(
-     const at::Tensor &bin_indices, // [N, F] int8 tensor
-     const at::Tensor &gradients,   // [N] float tensor
-     at::Tensor &grad_hist,         // [F * B] float tensor (preallocated)
-     at::Tensor &hess_hist,         // [F * B] float tensor (preallocated)
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1)
- {
-     int64_t N = bin_indices.size(0);
-     int64_t F = bin_indices.size(1);
-     int64_t B = num_bins;
-
-     // Define grid and block dimensions.
-     // blockDim.x: number of threads per block (for processing samples).
-     // gridDim.x: number of feature tiles.
-     int grid_x = (F + F_TILE - 1) / F_TILE;
-     // gridDim.y: number of sample chunks.
-     int grid_y = (N + threads_per_block - 1) / threads_per_block;
-     dim3 blocks(grid_x, grid_y);
-     dim3 threads(threads_per_block);
-
-     // Calculate shared memory size:
-     // We allocate 2 arrays of size (F_TILE * B) floats (one for grad and one for hess).
-     size_t shared_mem_size = 2 * F_TILE * B * sizeof(float);
-
-     histogram_kernel_shared_sample<<<blocks, threads, shared_mem_size>>>(
-         bin_indices.data_ptr<int8_t>(),
-         gradients.data_ptr<float>(),
-         grad_hist.data_ptr<float>(),
-         hess_hist.data_ptr<float>(),
-         N, F, B);
- }
-
- // CUDA kernel: tiled, 64-bit safe
- __global__ void histogram_tiled_kernel(
-     const int8_t *__restrict__ bin_indices, // [N, F]
-     const float *__restrict__ gradients,    // [N]
-     float *__restrict__ grad_hist,          // [F * B]
-     float *__restrict__ hess_hist,          // [F * B]
-     int64_t F, int64_t B, int64_t tile_size)
- {
-     int64_t feature_tiles = (F + tile_size - 1) / tile_size;
-     int64_t row = static_cast<int64_t>(blockIdx.x) / feature_tiles;
-     int64_t tile = static_cast<int64_t>(blockIdx.x) % feature_tiles;
-     int64_t feat = tile * tile_size + threadIdx.x;
-
-     if (feat >= F)
-         return;
-
-     int8_t bin = bin_indices[row * F + feat];
-     if (bin >= 0 && bin < B)
-     {
-         int64_t idx = feat * B + bin;
-         atomicAdd(&grad_hist[idx], gradients[row]);
-         atomicAdd(&hess_hist[idx], 1.0f);
-     }
- }
-
- // Host function exposed to PyTorch
- void launch_histogram_kernel_cuda_2(
-     const at::Tensor &bin_indices, // int8 [N, F]
-     const at::Tensor &gradients,   // float32 [N]
-     at::Tensor &grad_hist,         // float32 [F * B]
-     at::Tensor &hess_hist,         // float32 [F * B]
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1)
- {
-     int64_t N = bin_indices.size(0);
-     int64_t F = bin_indices.size(1);
-     int64_t tile_size = threads_per_block;
-     int64_t feature_tiles = (F + tile_size - 1) / tile_size;
-     int64_t total_blocks = N * feature_tiles;
-
-     histogram_tiled_kernel<<<
-         static_cast<int>(total_blocks),
-         static_cast<int>(tile_size)>>>(
-         bin_indices.data_ptr<int8_t>(),
-         gradients.data_ptr<float>(),
-         grad_hist.data_ptr<float>(),
-         hess_hist.data_ptr<float>(),
-         F, num_bins, tile_size);
-
-     // Optional: check for kernel launch failure
-     cudaError_t err = cudaGetLastError();
-     if (err != cudaSuccess)
-     {
-         printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-     }
- }
-
- __global__ void histogram_tiled_configurable_kernel(
-     const int8_t *__restrict__ bin_indices, // [N, F]
-     const float *__restrict__ gradients,    // [N]
-     float *__restrict__ grad_hist,          // [F * B]
-     float *__restrict__ hess_hist,          // [F * B]
-     int64_t N, int64_t F, int64_t B,
-     int rows_per_thread)
- {
-     int feat = blockIdx.x; // 1 block per feature
-     int row_start = (blockIdx.y * blockDim.x + threadIdx.x) * rows_per_thread;
-
-     extern __shared__ float shmem[];
-     float *sh_grad = shmem;       // [B]
-     float *sh_hess = &sh_grad[B]; // [B]
-
-     // Initialize shared memory histograms
-     for (int b = threadIdx.x; b < B; b += blockDim.x)
-     {
-         sh_grad[b] = 0.0f;
-         sh_hess[b] = 0.0f;
-     }
-     __syncthreads();
-
-     // Each thread processes multiple rows
-     for (int r = 0; r < rows_per_thread; ++r)
-     {
-         int row = row_start + r;
-         if (row < N)
-         {
-             int8_t bin = bin_indices[row * F + feat];
-             if (bin >= 0 && bin < B)
-             {
-                 atomicAdd(&sh_grad[bin], gradients[row]);
-                 atomicAdd(&sh_hess[bin], 1.0f);
-             }
-         }
-     }
-     __syncthreads();
-
-     // One thread per bin writes results back to global memory
-     for (int b = threadIdx.x; b < B; b += blockDim.x)
-     {
-         int64_t idx = feat * B + b;
-         atomicAdd(&grad_hist[idx], sh_grad[b]);
-         atomicAdd(&hess_hist[idx], sh_hess[b]);
-     }
- }
-
- void launch_histogram_kernel_cuda_configurable(
-     const at::Tensor &bin_indices,
-     const at::Tensor &gradients,
-     at::Tensor &grad_hist,
-     at::Tensor &hess_hist,
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1)
- {
-     int64_t N = bin_indices.size(0);
-     int64_t F = bin_indices.size(1);
-
-     int rows_per_block = threads_per_block * rows_per_thread;
-     int row_tiles = (N + rows_per_block - 1) / rows_per_block;
-
-     dim3 blocks(F, row_tiles); // grid.x = F, grid.y = row_tiles
-     dim3 threads(threads_per_block);
-     int shared_mem_bytes = 2 * num_bins * sizeof(float);
-
-     histogram_tiled_configurable_kernel<<<blocks, threads, shared_mem_bytes>>>(
-         bin_indices.data_ptr<int8_t>(),
-         gradients.data_ptr<float>(),
-         grad_hist.data_ptr<float>(),
-         hess_hist.data_ptr<float>(),
-         N, F, num_bins,
-         rows_per_thread);
-
-     cudaError_t err = cudaGetLastError();
-     if (err != cudaSuccess)
-     {
-         printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-     }
- }