warpgbm 0.1.16__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {warpgbm-0.1.16/warpgbm.egg-info → warpgbm-0.1.18}/PKG-INFO +2 -2
  2. {warpgbm-0.1.16 → warpgbm-0.1.18}/README.md +1 -1
  3. {warpgbm-0.1.16 → warpgbm-0.1.18}/pyproject.toml +1 -1
  4. {warpgbm-0.1.16 → warpgbm-0.1.18}/setup.py +1 -0
  5. warpgbm-0.1.18/tests/test_fit_predict_corr.py +46 -0
  6. warpgbm-0.1.18/version.txt +1 -0
  7. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/core.py +154 -178
  8. warpgbm-0.1.18/warpgbm/cuda/best_split_kernel.cu +79 -0
  9. warpgbm-0.1.18/warpgbm/cuda/binner.cu +52 -0
  10. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/node_kernel.cpp +11 -6
  11. {warpgbm-0.1.16 → warpgbm-0.1.18/warpgbm.egg-info}/PKG-INFO +2 -2
  12. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/SOURCES.txt +1 -0
  13. warpgbm-0.1.16/tests/test_fit_predict_corr.py +0 -66
  14. warpgbm-0.1.16/version.txt +0 -1
  15. warpgbm-0.1.16/warpgbm/cuda/best_split_kernel.cu +0 -112
  16. {warpgbm-0.1.16 → warpgbm-0.1.18}/LICENSE +0 -0
  17. {warpgbm-0.1.16 → warpgbm-0.1.18}/MANIFEST.in +0 -0
  18. {warpgbm-0.1.16 → warpgbm-0.1.18}/setup.cfg +0 -0
  19. {warpgbm-0.1.16 → warpgbm-0.1.18}/tests/__init__.py +0 -0
  20. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/__init__.py +0 -0
  21. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/__init__.py +0 -0
  22. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm/cuda/histogram_kernel.cu +0 -0
  23. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/dependency_links.txt +0 -0
  24. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/requires.txt +0 -0
  25. {warpgbm-0.1.16 → warpgbm-0.1.18}/warpgbm.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warpgbm
3
- Version: 0.1.16
3
+ Version: 0.1.18
4
4
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
5
5
  License: GNU GENERAL PUBLIC LICENSE
6
6
  Version 3, 29 June 2007
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
706
706
 
707
707
  ## Performance Note
708
708
 
709
- In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
709
+ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
710
710
 
711
711
  ---
712
712
 
@@ -18,7 +18,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
18
18
 
19
19
  ## Performance Note
20
20
 
21
- In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
21
+ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
22
22
 
23
23
  ---
24
24
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "warpgbm"
7
- version = "0.1.16"
7
+ version = "0.1.18"
8
8
  description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -22,6 +22,7 @@ def get_extensions():
22
22
  sources=[
23
23
  "warpgbm/cuda/histogram_kernel.cu",
24
24
  "warpgbm/cuda/best_split_kernel.cu",
25
+ "warpgbm/cuda/binner.cu",
25
26
  "warpgbm/cuda/node_kernel.cpp",
26
27
  ]
27
28
  )
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ from warpgbm import WarpGBM
3
+ from sklearn.datasets import make_regression
4
+
5
+ import numpy as np
6
+ import time
7
+ from warpgbm import WarpGBM
8
+ from sklearn.datasets import make_regression
9
+
10
def test_fit_predictpytee_correlation():
    """Train and predict with each histogram kernel and require high in-sample correlation.

    One synthetic regression problem (all rows in a single era) is shared by
    three WarpGBM models that differ only in ``histogram_computer``. For each
    model the fit and predict wall-clock times are printed and the Pearson
    correlation between predictions and targets is collected; the test passes
    only when every collected correlation exceeds 0.95.
    """
    np.random.seed(42)
    n_rows, n_cols = 100_000, 1000
    X, y = make_regression(n_samples=n_rows, n_features=n_cols, noise=0.1, random_state=42)
    era = np.zeros(n_rows, dtype=np.int32)  # every sample is assigned to era 0

    # Hyper-parameters common to all three runs; only the histogram kernel varies.
    shared_params = dict(
        max_depth=10,
        num_bins=10,
        n_estimators=10,
        learning_rate=1,
        verbosity=False,
        threads_per_block=128,
        rows_per_thread=4,
    )

    corrs = []
    for hist_type in ('hist1', 'hist2', 'hist3'):
        print(f"\nTesting histogram method: {hist_type}")
        model = WarpGBM(histogram_computer=hist_type, **shared_params)

        fit_began = time.time()
        model.fit(X, y, era_id=era)
        fit_time = time.time() - fit_began
        print(f" Fit time: {fit_time:.3f} seconds")

        predict_began = time.time()
        preds = model.predict(X)
        pred_time = time.time() - predict_began
        print(f" Predict time: {pred_time:.3f} seconds")

        # Off-diagonal entry of the 2x2 correlation matrix is the Pearson r.
        corr = np.corrcoef(preds, y)[0, 1]
        print(f" Correlation: {corr:.4f}")
        corrs.append(corr)

    assert (np.array(corrs) > 0.95).all(), f"In-sample correlation too low: {corrs}"
@@ -0,0 +1 @@
1
+ 0.1.18
@@ -12,40 +12,6 @@ histogram_kernels = {
12
12
  'hist3': node_kernel.compute_histogram3
13
13
  }
14
14
 
15
- @torch.jit.script
16
- def jit_find_best_split(
17
- G: Tensor, H: Tensor,
18
- lambda_l2: float,
19
- lambda_l1: float, # unused placeholder for now
20
- min_split_gain: float,
21
- min_child_weight: float
22
- ) -> Tuple[int, int]:
23
- F, B = G.size()
24
- Bm1 = B - 1
25
- eps = 0
26
-
27
- GH = torch.stack([G, H], dim=0).cumsum(dim=2) # [2, F, B]
28
- GL, HL_raw = GH[0, :, :-1], GH[1, :, :-1] # [F, B-1]
29
- GP, HP = GH[0, :, -1:], GH[1, :, -1:] # [F, 1]
30
- H_R_raw = HP - HL_raw
31
-
32
- # Validity mask using raw child hessians
33
- valid = (HL_raw >= min_child_weight) & (H_R_raw >= min_child_weight)
34
-
35
- # Closed-form gain
36
- HL, HP = HL_raw + lambda_l2, HP + lambda_l2
37
- num = (HP * GL - HL * GP).pow(2)
38
- denom = HP * HL * (HP - HL) + eps
39
- gain = torch.where(valid & (num / denom >= min_split_gain), num / denom, torch.full_like(num, -float("inf")))
40
-
41
- gain_flat = gain.view(-1)
42
- best_idx = torch.argmax(gain_flat)
43
-
44
- if gain_flat[best_idx].item() == float('-inf'):
45
- return -1, -1
46
-
47
- return best_idx // Bm1, best_idx % Bm1
48
-
49
15
  class WarpGBM(BaseEstimator, RegressorMixin):
50
16
  def __init__(
51
17
  self,
@@ -80,12 +46,8 @@ class WarpGBM(BaseEstimator, RegressorMixin):
80
46
  self.Y_gpu = None
81
47
  self.num_features = None
82
48
  self.num_samples = None
83
- self.out_feature = torch.zeros(1, device=self.device, dtype=torch.int32)
84
- self.out_bin = torch.zeros(1, device=self.device, dtype=torch.int32)
85
49
  self.min_child_weight = min_child_weight
86
50
  self.min_split_gain = min_split_gain
87
- self.best_gain = torch.tensor([-float('inf')], dtype=torch.float32, device=self.device)
88
- self.best_feature = torch.tensor([-1], dtype=torch.int32, device=self.device)
89
51
  self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
90
52
  self.compute_histogram = histogram_kernels[histogram_computer]
91
53
  self.threads_per_block = threads_per_block
@@ -102,45 +64,49 @@ class WarpGBM(BaseEstimator, RegressorMixin):
102
64
  self.root_node_indices = torch.arange(self.num_samples, device=self.device)
103
65
  self.base_prediction = self.Y_gpu.mean().item()
104
66
  self.gradients += self.base_prediction
105
- self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
106
- self.forest = self.grow_forest()
67
+ self.best_gains = torch.zeros(self.num_features, device=self.device)
68
+ self.best_bins = torch.zeros(self.num_features, device=self.device, dtype=torch.int32)
69
+ with torch.no_grad():
70
+ self.forest = self.grow_forest()
107
71
  return self
108
-
109
- def compute_quantile_bins(self, X, num_bins):
110
- quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1] # exclude 0% and 100%
111
- bin_edges = torch.quantile(X, quantiles, dim=0) # shape: [B-1, F]
112
- return bin_edges.T # shape: [F, B-1]
113
72
 
114
73
  def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
115
- self.num_samples, self.num_features = X_np.shape
116
- Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
117
- era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
118
- is_integer_type = np.issubdtype(X_np.dtype, np.integer)
119
- if is_integer_type:
120
- max_vals = X_np.max(axis=0)
121
- if np.all(max_vals < self.num_bins):
122
- print("Detected pre-binned integer input — skipping quantile binning.")
123
- bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
124
-
125
- # We'll store None or an empty tensor in self.bin_edges
126
- # to indicate that we skip binning at predict-time
127
- bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
128
- bin_edges = bin_edges.to(self.device)
129
- unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
130
- return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
131
- else:
132
- print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
133
-
134
- print("Performing quantile binning on CPU...")
135
- X_cpu = torch.from_numpy(X_np).type(torch.float32) # CPU tensor
136
- bin_edges_cpu = self.compute_quantile_bins(X_cpu, self.num_bins).type(torch.float32).contiguous()
137
- bin_indices_cpu = torch.empty((self.num_samples, self.num_features), dtype=torch.int8)
138
- for f in range(self.num_features):
139
- bin_indices_cpu[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
140
- bin_indices = bin_indices_cpu.to(self.device).contiguous()
141
- bin_edges = bin_edges_cpu.to(self.device)
142
- unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
143
- return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
74
+ with torch.no_grad():
75
+ self.num_samples, self.num_features = X_np.shape
76
+ Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
77
+ era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
78
+ is_integer_type = np.issubdtype(X_np.dtype, np.integer)
79
+ if is_integer_type:
80
+ max_vals = X_np.max(axis=0)
81
+ if np.all(max_vals < self.num_bins):
82
+ print("Detected pre-binned integer input — skipping quantile binning.")
83
+ bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
84
+
85
+ # We'll store None or an empty tensor in self.bin_edges
86
+ # to indicate that we skip binning at predict-time
87
+ bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
88
+ bin_edges = bin_edges.to(self.device)
89
+ unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
90
+ return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
91
+ else:
92
+ print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
93
+
94
+ bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
95
+ bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
96
+
97
+ X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
98
+
99
+ for f in range(self.num_features):
100
+ X_f = X_np[:, f].to('cuda', non_blocking=True)
101
+ quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
102
+ bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous() # shape: [B-1] for 1D input
103
+ bin_indices_f = bin_indices[:, f].contiguous() # view into output
104
+ node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
105
+ bin_indices[:,f] = bin_indices_f
106
+ bin_edges[f,:] = bin_edges_f
107
+
108
+ unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
109
+ return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
144
110
 
145
111
  def compute_histograms(self, bin_indices_sub, gradients):
146
112
  grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
@@ -158,15 +124,24 @@ class WarpGBM(BaseEstimator, RegressorMixin):
158
124
  return grad_hist, hess_hist
159
125
 
160
126
  def find_best_split(self, gradient_histogram, hessian_histogram):
161
- f,b = jit_find_best_split(
127
+ node_kernel.compute_split(
162
128
  gradient_histogram,
163
129
  hessian_histogram,
164
- self.L2_reg,
165
- self.L1_reg,
166
130
  self.min_split_gain,
167
131
  self.min_child_weight,
132
+ self.L2_reg,
133
+ self.best_gains,
134
+ self.best_bins,
135
+ self.threads_per_block
168
136
  )
169
- return (f, b)
137
+
138
+ if torch.all(self.best_bins == -1):
139
+ return -1, -1 # No valid split found
140
+
141
+ f = torch.argmax(self.best_gains).item()
142
+ b = self.best_bins[f].item()
143
+
144
+ return f, b
170
145
 
171
146
  def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
172
147
  if depth == self.max_depth:
@@ -226,10 +201,10 @@ class WarpGBM(BaseEstimator, RegressorMixin):
226
201
  depth=0
227
202
  )
228
203
  forest[i] = tree
229
- # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
230
- # self.training_loss.append(loss)
231
- # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
232
-
204
+ # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
205
+ # self.training_loss.append(loss)
206
+ # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
207
+
233
208
  print("Finished training forest.")
234
209
  return forest
235
210
 
@@ -239,103 +214,104 @@ class WarpGBM(BaseEstimator, RegressorMixin):
239
214
  We assume `flatten_forest_to_tensors` has produced self.flat_forest with
240
215
  "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
241
216
  """
242
- # 1) Convert X_np -> bin_indices
243
- is_integer_type = np.issubdtype(X_np.dtype, np.integer)
244
- if is_integer_type:
245
- max_vals = X_np.max(axis=0)
246
- if np.all(max_vals < self.num_bins):
247
- bin_indices = X_np.astype(np.int8)
217
+ with torch.no_grad():
218
+ # 1) Convert X_np -> bin_indices
219
+ is_integer_type = np.issubdtype(X_np.dtype, np.integer)
220
+ if is_integer_type:
221
+ max_vals = X_np.max(axis=0)
222
+ if np.all(max_vals < self.num_bins):
223
+ bin_indices = X_np.astype(np.int8)
224
+ else:
225
+ raise ValueError("Pre-binned integers must be < num_bins")
248
226
  else:
249
- raise ValueError("Pre-binned integers must be < num_bins")
250
- else:
251
- X_cpu = torch.from_numpy(X_np).type(torch.float32)
252
- bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
253
- bin_edges_cpu = self.bin_edges.to('cpu')
254
- for f in range(self.num_features):
255
- bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
256
- bin_indices = bin_indices.numpy()
257
-
258
- # 2) Ensure we have a padded representation
259
- self.flat_forest = self.flatten_forest_to_tensors(self.forest)
260
-
261
- features_t = self.flat_forest["features"] # [n_trees, max_nodes], int16
262
- thresholds_t = self.flat_forest["thresholds"] # [n_trees, max_nodes], int16
263
- values_t = self.flat_forest["leaf_values"] # [n_trees, max_nodes], float32
264
- max_nodes = self.flat_forest["max_nodes"]
265
-
266
- n_trees = features_t.shape[0]
267
- N = bin_indices.shape[0]
268
- out = np.zeros(N, dtype=np.float32)
269
-
270
- # 3) Process rows in chunks
271
- for start in tqdm(range(0, N, chunk_size)):
272
- end = min(start + chunk_size, N)
273
- chunk_np = bin_indices[start:end] # shape [chunk_size, F]
274
- chunk_gpu = torch.from_numpy(chunk_np).to(self.device) # [chunk_size, F], int8
275
-
276
- # Accumulate raw (unscaled) leaf sums
277
- chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
278
-
279
- # node_idx[i] tracks the current node index in the padded tree for row i
280
- node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
281
-
282
- # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
283
- active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
284
-
285
- for t in range(n_trees):
286
- # Reset for each tree (each tree is independent)
287
- node_idx.fill_(0)
288
- active.fill_(True)
289
-
290
- tree_features = features_t[t] # shape [max_nodes], int16
291
- tree_thresh = thresholds_t[t] # shape [max_nodes], int16
292
- tree_values = values_t[t] # shape [max_nodes], float32
293
-
294
- # Up to self.max_depth+1 layers
295
- for _level in range(self.max_depth + 1):
296
- active_idx = active.nonzero(as_tuple=True)[0]
297
- if active_idx.numel() == 0:
298
- break # all rows are done in this tree
299
-
300
- current_node_idx = node_idx[active_idx]
301
- f = tree_features[current_node_idx] # shape [#active], int16
302
- thr = tree_thresh[current_node_idx] # shape [#active], int16
303
- vals = tree_values[current_node_idx] # shape [#active], float32
304
-
305
- mask_no_node = (f == -2)
306
- mask_leaf = (f == -1)
307
-
308
- # If leaf, add leaf value and mark inactive.
309
- if mask_leaf.any():
310
- leaf_rows = active_idx[mask_leaf]
311
- chunk_preds[leaf_rows] += vals[mask_leaf]
312
- active[leaf_rows] = False
313
-
314
- # If no node, mark inactive.
315
- if mask_no_node.any():
316
- no_node_rows = active_idx[mask_no_node]
317
- active[no_node_rows] = False
318
-
319
- # For internal nodes, perform bin comparison.
320
- mask_internal = (~mask_leaf & ~mask_no_node)
321
- if mask_internal.any():
322
- internal_rows = active_idx[mask_internal]
323
- act_f = f[mask_internal].long()
324
- act_thr = thr[mask_internal]
325
- binvals = chunk_gpu[internal_rows, act_f]
326
- go_left = (binvals <= act_thr)
327
- new_left_idx = current_node_idx[mask_internal] * 2 + 1
328
- new_right_idx = current_node_idx[mask_internal] * 2 + 2
329
- node_idx[internal_rows[go_left]] = new_left_idx[go_left]
330
- node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
331
- # end per-tree layer loop
332
- # end for each tree
333
-
334
- out[start:end] = (
335
- self.base_prediction + self.learning_rate * chunk_preds
336
- ).cpu().numpy()
337
-
338
- return out
227
+ X_cpu = torch.from_numpy(X_np).type(torch.float32)
228
+ bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
229
+ bin_edges_cpu = self.bin_edges.to('cpu')
230
+ for f in range(self.num_features):
231
+ bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
232
+ bin_indices = bin_indices.numpy()
233
+
234
+ # 2) Ensure we have a padded representation
235
+ self.flat_forest = self.flatten_forest_to_tensors(self.forest)
236
+
237
+ features_t = self.flat_forest["features"] # [n_trees, max_nodes], int16
238
+ thresholds_t = self.flat_forest["thresholds"] # [n_trees, max_nodes], int16
239
+ values_t = self.flat_forest["leaf_values"] # [n_trees, max_nodes], float32
240
+ max_nodes = self.flat_forest["max_nodes"]
241
+
242
+ n_trees = features_t.shape[0]
243
+ N = bin_indices.shape[0]
244
+ out = np.zeros(N, dtype=np.float32)
245
+
246
+ # 3) Process rows in chunks
247
+ for start in tqdm(range(0, N, chunk_size)):
248
+ end = min(start + chunk_size, N)
249
+ chunk_np = bin_indices[start:end] # shape [chunk_size, F]
250
+ chunk_gpu = torch.from_numpy(chunk_np).to(self.device) # [chunk_size, F], int8
251
+
252
+ # Accumulate raw (unscaled) leaf sums
253
+ chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
254
+
255
+ # node_idx[i] tracks the current node index in the padded tree for row i
256
+ node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
257
+
258
+ # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
259
+ active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
260
+
261
+ for t in range(n_trees):
262
+ # Reset for each tree (each tree is independent)
263
+ node_idx.fill_(0)
264
+ active.fill_(True)
265
+
266
+ tree_features = features_t[t] # shape [max_nodes], int16
267
+ tree_thresh = thresholds_t[t] # shape [max_nodes], int16
268
+ tree_values = values_t[t] # shape [max_nodes], float32
269
+
270
+ # Up to self.max_depth+1 layers
271
+ for _level in range(self.max_depth + 1):
272
+ active_idx = active.nonzero(as_tuple=True)[0]
273
+ if active_idx.numel() == 0:
274
+ break # all rows are done in this tree
275
+
276
+ current_node_idx = node_idx[active_idx]
277
+ f = tree_features[current_node_idx] # shape [#active], int16
278
+ thr = tree_thresh[current_node_idx] # shape [#active], int16
279
+ vals = tree_values[current_node_idx] # shape [#active], float32
280
+
281
+ mask_no_node = (f == -2)
282
+ mask_leaf = (f == -1)
283
+
284
+ # If leaf, add leaf value and mark inactive.
285
+ if mask_leaf.any():
286
+ leaf_rows = active_idx[mask_leaf]
287
+ chunk_preds[leaf_rows] += vals[mask_leaf]
288
+ active[leaf_rows] = False
289
+
290
+ # If no node, mark inactive.
291
+ if mask_no_node.any():
292
+ no_node_rows = active_idx[mask_no_node]
293
+ active[no_node_rows] = False
294
+
295
+ # For internal nodes, perform bin comparison.
296
+ mask_internal = (~mask_leaf & ~mask_no_node)
297
+ if mask_internal.any():
298
+ internal_rows = active_idx[mask_internal]
299
+ act_f = f[mask_internal].long()
300
+ act_thr = thr[mask_internal]
301
+ binvals = chunk_gpu[internal_rows, act_f]
302
+ go_left = (binvals <= act_thr)
303
+ new_left_idx = current_node_idx[mask_internal] * 2 + 1
304
+ new_right_idx = current_node_idx[mask_internal] * 2 + 2
305
+ node_idx[internal_rows[go_left]] = new_left_idx[go_left]
306
+ node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
307
+ # end per-tree layer loop
308
+ # end for each tree
309
+
310
+ out[start:end] = (
311
+ self.base_prediction + self.learning_rate * chunk_preds
312
+ ).cpu().numpy()
313
+
314
+ return out
339
315
 
340
316
  def flatten_forest_to_tensors(self, forest):
341
317
  """
@@ -0,0 +1,79 @@
1
+ #include <torch/extension.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime.h>
4
+
5
/**
 * Finds, independently for every feature, the histogram split with the highest gain.
 *
 * Each thread handles one feature f (its global thread index) and scans that
 * feature's row of the gradient and hessian histograms, addressed as
 * G[f * B + b] and H[f * B + b].
 *
 * A split at bin b places bins [0, b] in the left child and bins (b, B-1] in
 * the right child. It is eligible only when both child hessian sums are
 * >= min_child_samples. The gain is measured relative to the unsplit node,
 *
 *     gain = G_L^2 / (H_L + eps) + G_R^2 / (H_R + eps) - G^2 / (H + eps),
 *
 * so min_split_gain is a threshold on the actual improvement and not on the
 * children's score alone (which is large even for useless splits).
 *
 * @param G                  gradient histogram, F rows of B bins
 * @param H                  hessian histogram, F rows of B bins
 * @param F                  number of features
 * @param B                  number of bins per feature
 * @param min_split_gain     a split is accepted only if its gain is strictly greater
 * @param min_child_samples  lower bound applied to each child's hessian sum
 * @param eps                added to every hessian sum in the denominators
 * @param best_gains         out, [F]: best accepted gain, or min_split_gain if none
 * @param best_bins          out, [F]: bin of the best accepted split, or -1 if none
 */
__global__ void best_split_kernel_global_only(
    const float *__restrict__ G, // [F x B]
    const float *__restrict__ H, // [F x B]
    int F,
    int B,
    float min_split_gain,
    float min_child_samples,
    float eps,
    float *__restrict__ best_gains, // [F]
    int *__restrict__ best_bins     // [F]
)
{
    const int f = blockIdx.x * blockDim.x + threadIdx.x;
    if (f >= F)
        return; // surplus threads in the last block

    const float *g_row = G + f * B;
    const float *h_row = H + f * B;

    // Node totals for this feature's row.
    float G_total = 0.0f, H_total = 0.0f;
    for (int b = 0; b < B; ++b)
    {
        G_total += g_row[b];
        H_total += h_row[b];
    }

    // Score of leaving the node unsplit; subtracted from every candidate below.
    const float parent_score = (G_total * G_total) / (H_total + eps);

    float G_L = 0.0f, H_L = 0.0f;
    float best_gain = min_split_gain; // doubles as the acceptance threshold
    int best_bin = -1;

    // The last bin is excluded: splitting there would leave the right child empty.
    for (int b = 0; b < B - 1; ++b)
    {
        G_L += g_row[b];
        H_L += h_row[b];
        const float G_R = G_total - G_L;
        const float H_R = H_total - H_L;

        if (H_L >= min_child_samples && H_R >= min_child_samples)
        {
            const float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps) - parent_score;
            // A NaN gain (e.g. 0/0) fails this comparison and is never selected.
            if (gain > best_gain)
            {
                best_gain = gain;
                best_bin = b;
            }
        }
    }

    best_gains[f] = best_gain;
    best_bins[f] = best_bin;
}
53
+
54
/**
 * Host-side launcher for best_split_kernel_global_only.
 *
 * Validates the tensors, launches one thread per feature in blocks of
 * `threads`, and raises a c10::Error (surfaced as a Python exception through
 * the bindings) if the arguments are unusable or the launch fails.
 *
 * @param G                  gradient histogram, float32 CUDA tensor [F x B]
 * @param H                  hessian histogram, float32 CUDA tensor [F x B]
 * @param min_split_gain     forwarded to the kernel
 * @param min_child_samples  forwarded to the kernel
 * @param eps                forwarded to the kernel
 * @param best_gains         out, contiguous float32 CUDA tensor with at least F elements
 * @param best_bins          out, contiguous int32 CUDA tensor with at least F elements
 * @param threads            threads per block; must be positive
 */
void launch_best_split_kernel_cuda(
    const at::Tensor &G, // [F x B]
    const at::Tensor &H, // [F x B]
    float min_split_gain,
    float min_child_samples,
    float eps,
    at::Tensor &best_gains, // [F], float32
    at::Tensor &best_bins,  // [F], int32
    int threads)
{
    TORCH_CHECK(G.is_cuda() && H.is_cuda() && best_gains.is_cuda() && best_bins.is_cuda(),
                "compute_split: all tensors must be CUDA tensors");
    TORCH_CHECK(G.dim() == 2, "compute_split: G must be 2-D [F x B], got ", G.dim(), " dims");
    TORCH_CHECK(H.sizes() == G.sizes(), "compute_split: G and H must have the same shape");
    TORCH_CHECK(G.scalar_type() == at::kFloat && H.scalar_type() == at::kFloat,
                "compute_split: G and H must be float32");
    TORCH_CHECK(best_gains.scalar_type() == at::kFloat, "compute_split: best_gains must be float32");
    TORCH_CHECK(best_bins.scalar_type() == at::kInt, "compute_split: best_bins must be int32");
    TORCH_CHECK(best_gains.is_contiguous() && best_bins.is_contiguous(),
                "compute_split: best_gains and best_bins must be contiguous");
    TORCH_CHECK(threads > 0, "compute_split: threads must be positive, got ", threads);

    const int F = static_cast<int>(G.size(0));
    const int B = static_cast<int>(G.size(1));
    TORCH_CHECK(best_gains.numel() >= F && best_bins.numel() >= F,
                "compute_split: best_gains and best_bins need at least F = ", F, " elements");

    // A grid of zero blocks is an invalid launch configuration; nothing to do anyway.
    if (F == 0)
        return;

    // The kernel indexes rows as f * B + b, so the inputs must be densely packed.
    // contiguous() returns the same tensor when that is already the case.
    const at::Tensor G_dense = G.contiguous();
    const at::Tensor H_dense = H.contiguous();

    const int blocks = (F + threads - 1) / threads;

    best_split_kernel_global_only<<<blocks, threads>>>(
        G_dense.data_ptr<float>(),
        H_dense.data_ptr<float>(),
        F,
        B,
        min_split_gain,
        min_child_samples,
        eps,
        best_gains.data_ptr<float>(),
        best_bins.data_ptr<int>());

    // Launch-configuration errors (e.g. too many threads per block) are only
    // reported through cudaGetLastError; without this they would go unnoticed.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "compute_split: kernel launch failed: ", cudaGetErrorString(launch_status));
}
@@ -0,0 +1,52 @@
1
+ #include <torch/extension.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime.h>
4
+
5
/**
 * Assigns every value of one feature column to a bin.
 *
 * Each thread handles one element. The bin index is the number of edges that
 * are strictly less than the value, i.e. a value equal to an edge falls into
 * the lower bin. This is the convention of torch.bucketize(..., right=False),
 * so bins assigned here agree with bins computed by bucketize on the same
 * edges, including for values that coincide exactly with an edge.
 *
 * @param X            column values, N elements
 * @param bin_edges    B_minus1 edges in ascending order
 * @param bin_indices  out, N elements: bin index in [0, B_minus1]
 * @param N            number of values
 * @param B_minus1     number of edges
 */
__global__ void bin_column_kernel(
    const float *__restrict__ X,         // [N]
    const float *__restrict__ bin_edges, // [B - 1]
    int8_t *__restrict__ bin_indices,    // [N]
    int N,
    int B_minus1)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N)
        return; // surplus threads in the last block

    const float val = X[idx];
    int bin = 0;

    // Linear scan over the sorted edges; stop at the first edge that is >= val.
    // Strict '>' (not '>=') keeps a value sitting exactly on an edge in the lower bin.
    while (bin < B_minus1 && val > bin_edges[bin])
    {
        ++bin;
    }

    bin_indices[idx] = static_cast<int8_t>(bin);
}
27
+
28
/**
 * Host-side launcher for bin_column_kernel, exposed to Python through the bindings.
 *
 * Validates the tensors, launches one thread per value in blocks of 256, and
 * raises a c10::Error (surfaced as a Python exception) if the arguments are
 * unusable or the launch fails, instead of only printing a message.
 *
 * @param X            float32 CUDA tensor, 1-D, N values of one feature column
 * @param bin_edges    float32 CUDA tensor, 1-D, B-1 ascending edges (at most 127,
 *                     so that every bin index fits in int8)
 * @param bin_indices  out, contiguous int8 CUDA tensor with N elements
 */
void launch_bin_column_kernel(
    at::Tensor X,          // [N]
    at::Tensor bin_edges,  // [B - 1]
    at::Tensor bin_indices // [N]
)
{
    TORCH_CHECK(X.is_cuda() && bin_edges.is_cuda() && bin_indices.is_cuda(),
                "custom_cuda_binner: all tensors must be CUDA tensors");
    TORCH_CHECK(X.dim() == 1 && bin_edges.dim() == 1 && bin_indices.dim() == 1,
                "custom_cuda_binner: X, bin_edges and bin_indices must be 1-D");
    TORCH_CHECK(X.scalar_type() == at::kFloat && bin_edges.scalar_type() == at::kFloat,
                "custom_cuda_binner: X and bin_edges must be float32");
    TORCH_CHECK(bin_indices.scalar_type() == at::kChar, "custom_cuda_binner: bin_indices must be int8");
    TORCH_CHECK(bin_indices.is_contiguous(), "custom_cuda_binner: bin_indices must be contiguous");
    TORCH_CHECK(bin_indices.size(0) == X.size(0),
                "custom_cuda_binner: bin_indices must have as many elements as X");

    const int N = static_cast<int>(X.size(0));
    const int num_edges = static_cast<int>(bin_edges.size(0)); // this is B - 1, not B
    // The largest bin index equals the number of edges and is stored as int8.
    TORCH_CHECK(num_edges <= 127, "custom_cuda_binner: at most 127 bin edges fit in int8, got ", num_edges);

    // A grid of zero blocks is an invalid launch configuration; nothing to do anyway.
    if (N == 0)
        return;

    // The kernel reads both inputs with unit stride; contiguous() is a no-op
    // when the tensors are already densely packed.
    const at::Tensor X_dense = X.contiguous();
    const at::Tensor edges_dense = bin_edges.contiguous();

    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;

    bin_column_kernel<<<blocks, threads>>>(
        X_dense.data_ptr<float>(),
        edges_dense.data_ptr<float>(),
        bin_indices.data_ptr<int8_t>(),
        N,
        num_edges);

    // Report launch failures to the caller rather than printing and continuing
    // with an output buffer that was never written.
    const cudaError_t launch_status = cudaGetLastError();
    TORCH_CHECK(launch_status == cudaSuccess,
                "custom_cuda_binner: kernel launch failed: ", cudaGetErrorString(launch_status));
}
@@ -21,15 +21,14 @@ void launch_histogram_kernel_cuda_2(
21
21
  int rows_per_thread = 1);
22
22
 
23
23
  void launch_best_split_kernel_cuda(
24
- const at::Tensor &G,
25
- const at::Tensor &H,
26
- int F,
27
- int B,
24
+ const at::Tensor &G, // [F x B]
25
+ const at::Tensor &H, // [F x B]
28
26
  float min_split_gain,
29
27
  float min_child_samples,
30
28
  float eps,
31
- at::Tensor &out_feature,
32
- at::Tensor &out_bin);
29
+ at::Tensor &best_gains, // [F], float32
30
+ at::Tensor &best_bins,
31
+ int threads);
33
32
 
34
33
  void launch_histogram_kernel_cuda_configurable(
35
34
  const at::Tensor &bin_indices,
@@ -40,6 +39,11 @@ void launch_histogram_kernel_cuda_configurable(
40
39
  int threads_per_block = 256,
41
40
  int rows_per_thread = 1);
42
41
 
42
+ void launch_bin_column_kernel(
43
+ at::Tensor X,
44
+ at::Tensor bin_edges,
45
+ at::Tensor bin_indices);
46
+
43
47
  // Bindings
44
48
  PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
45
49
  {
@@ -47,4 +51,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
47
51
  m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
48
52
  m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
49
53
  m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
54
+ m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
50
55
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warpgbm
3
- Version: 0.1.16
3
+ Version: 0.1.18
4
4
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
5
5
  License: GNU GENERAL PUBLIC LICENSE
6
6
  Version 3, 29 June 2007
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
706
706
 
707
707
  ## Performance Note
708
708
 
709
- In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
709
+ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
710
710
 
711
711
  ---
712
712
 
@@ -15,5 +15,6 @@ warpgbm.egg-info/requires.txt
15
15
  warpgbm.egg-info/top_level.txt
16
16
  warpgbm/cuda/__init__.py
17
17
  warpgbm/cuda/best_split_kernel.cu
18
+ warpgbm/cuda/binner.cu
18
19
  warpgbm/cuda/histogram_kernel.cu
19
20
  warpgbm/cuda/node_kernel.cpp
@@ -1,66 +0,0 @@
1
- import numpy as np
2
- from warpgbm import WarpGBM
3
- from sklearn.datasets import make_regression
4
-
5
- def test_fit_predict_correlation():
6
- np.random.seed(42)
7
- N = 1_000_000
8
- F = 100
9
- X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
10
- era = np.zeros(N, dtype=np.int32)
11
- corrs = []
12
-
13
- model = WarpGBM(
14
- max_depth = 10,
15
- num_bins = 10,
16
- n_estimators = 10,
17
- learning_rate = 1,
18
- verbosity=False,
19
- histogram_computer='hist1',
20
- threads_per_block=32,
21
- rows_per_thread=4
22
- )
23
-
24
- model.fit(X, y, era_id=era)
25
- preds = model.predict(X)
26
-
27
- # Pearson correlation in-sample
28
- corr = np.corrcoef(preds, y)[0, 1]
29
- corrs.append(corr)
30
-
31
- model = WarpGBM(
32
- max_depth = 10,
33
- num_bins = 10,
34
- n_estimators = 10,
35
- learning_rate = 1,
36
- verbosity=False,
37
- histogram_computer='hist2',
38
- threads_per_block=32,
39
- rows_per_thread=4
40
- )
41
-
42
- model.fit(X, y, era_id=era)
43
- preds = model.predict(X)
44
-
45
- # Pearson correlation in-sample
46
- corr = np.corrcoef(preds, y)[0, 1]
47
- corrs.append(corr)
48
-
49
- model = WarpGBM(
50
- max_depth = 10,
51
- num_bins = 10,
52
- n_estimators = 10,
53
- learning_rate = 1,
54
- verbosity=False,
55
- histogram_computer='hist3',
56
- threads_per_block=32,
57
- rows_per_thread=4
58
- )
59
-
60
- model.fit(X, y, era_id=era)
61
- preds = model.predict(X)
62
-
63
- # Pearson correlation in-sample
64
- corr = np.corrcoef(preds, y)[0, 1]
65
- corrs.append(corr)
66
- assert ( np.array(corrs) > 0.95 ).all(), f"In-sample correlation too low: {corr:.4f}"
@@ -1 +0,0 @@
1
- 0.1.16
@@ -1,112 +0,0 @@
1
- #include <torch/extension.h>
2
- #include <cuda.h>
3
- #include <cuda_runtime.h>
4
-
5
- __global__ void best_split_kernel(
6
- const float *__restrict__ G, // [F x B]
7
- const float *__restrict__ H, // [F x B]
8
- int F,
9
- int B,
10
- float min_split_gain,
11
- float min_child_samples,
12
- float eps,
13
- int *out_feature,
14
- int *out_bin,
15
- void *shared_mem)
16
- {
17
- int f = blockIdx.x * blockDim.x + threadIdx.x;
18
- if (f >= F)
19
- return;
20
-
21
- // Cast shared memory
22
- extern __shared__ char smem[];
23
- float *gains = reinterpret_cast<float *>(smem);
24
- int *features = reinterpret_cast<int *>(&gains[blockDim.x]);
25
- int *bins = reinterpret_cast<int *>(&features[blockDim.x]);
26
-
27
- // Calculate total G and H for this feature
28
- float G_total = 0.0f, H_total = 0.0f;
29
- for (int b = 0; b < B; ++b)
30
- {
31
- G_total += G[f * B + b];
32
- H_total += H[f * B + b];
33
- }
34
-
35
- float G_L = 0.0f, H_L = 0.0f;
36
- float best_gain = min_split_gain;
37
- int best_bin = -1;
38
-
39
- for (int b = 0; b < B - 1; ++b)
40
- {
41
- G_L += G[f * B + b];
42
- H_L += H[f * B + b];
43
- float G_R = G_total - G_L;
44
- float H_R = H_total - H_L;
45
-
46
- if (H_L > min_child_samples && H_R > min_child_samples)
47
- {
48
- float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
49
- if (gain > best_gain)
50
- {
51
- best_gain = gain;
52
- best_bin = b;
53
- }
54
- }
55
- }
56
-
57
- gains[threadIdx.x] = best_gain;
58
- features[threadIdx.x] = f;
59
- bins[threadIdx.x] = best_bin;
60
- __syncthreads();
61
-
62
- // Thread 0 in each block finds best among its block
63
- if (threadIdx.x == 0)
64
- {
65
- float block_best_gain = min_split_gain;
66
- int block_best_feature = -1;
67
- int block_best_bin = -1;
68
- for (int i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < F; ++i)
69
- {
70
- if (gains[i] > block_best_gain)
71
- {
72
- block_best_gain = gains[i];
73
- block_best_feature = features[i];
74
- block_best_bin = bins[i];
75
- }
76
- }
77
-
78
- // Write to global outputs
79
- *out_feature = block_best_feature;
80
- *out_bin = block_best_bin;
81
- }
82
- }
83
-
84
- void launch_best_split_kernel_cuda(
85
- const at::Tensor &G,
86
- const at::Tensor &H,
87
- int F,
88
- int B,
89
- float min_split_gain,
90
- float min_child_samples,
91
- float eps,
92
- at::Tensor &out_feature,
93
- at::Tensor &out_bin)
94
- {
95
- int threads = 256;
96
- int blocks = (F + threads - 1) / threads;
97
-
98
- size_t shared_mem_bytes = threads * (sizeof(float) + 2 * sizeof(int));
99
-
100
- best_split_kernel<<<blocks, threads, shared_mem_bytes>>>(
101
- G.data_ptr<float>(),
102
- H.data_ptr<float>(),
103
- F,
104
- B,
105
- min_split_gain,
106
- min_child_samples,
107
- eps,
108
- out_feature.data_ptr<int>(),
109
- out_bin.data_ptr<int>(),
110
- nullptr // shared memory pointer not needed; just launch size
111
- );
112
- }
File without changes
File without changes
File without changes
File without changes
File without changes