warpgbm 0.1.17__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {warpgbm-0.1.17/warpgbm.egg-info → warpgbm-0.1.18}/PKG-INFO +2 -2
  2. {warpgbm-0.1.17 → warpgbm-0.1.18}/README.md +1 -1
  3. {warpgbm-0.1.17 → warpgbm-0.1.18}/pyproject.toml +1 -1
  4. warpgbm-0.1.18/tests/test_fit_predict_corr.py +46 -0
  5. warpgbm-0.1.18/version.txt +1 -0
  6. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/core.py +36 -60
  7. warpgbm-0.1.18/warpgbm/cuda/best_split_kernel.cu +79 -0
  8. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/node_kernel.cpp +5 -6
  9. {warpgbm-0.1.17 → warpgbm-0.1.18/warpgbm.egg-info}/PKG-INFO +2 -2
  10. warpgbm-0.1.17/tests/test_fit_predict_corr.py +0 -66
  11. warpgbm-0.1.17/version.txt +0 -1
  12. warpgbm-0.1.17/warpgbm/cuda/best_split_kernel.cu +0 -112
  13. {warpgbm-0.1.17 → warpgbm-0.1.18}/LICENSE +0 -0
  14. {warpgbm-0.1.17 → warpgbm-0.1.18}/MANIFEST.in +0 -0
  15. {warpgbm-0.1.17 → warpgbm-0.1.18}/setup.cfg +0 -0
  16. {warpgbm-0.1.17 → warpgbm-0.1.18}/setup.py +0 -0
  17. {warpgbm-0.1.17 → warpgbm-0.1.18}/tests/__init__.py +0 -0
  18. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/__init__.py +0 -0
  19. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/__init__.py +0 -0
  20. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/binner.cu +0 -0
  21. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm/cuda/histogram_kernel.cu +0 -0
  22. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/SOURCES.txt +0 -0
  23. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/dependency_links.txt +0 -0
  24. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/requires.txt +0 -0
  25. {warpgbm-0.1.17 → warpgbm-0.1.18}/warpgbm.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warpgbm
3
- Version: 0.1.17
3
+ Version: 0.1.18
4
4
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
5
5
  License: GNU GENERAL PUBLIC LICENSE
6
6
  Version 3, 29 June 2007
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
706
706
 
707
707
  ## Performance Note
708
708
 
709
- In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
709
+ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
710
710
 
711
711
  ---
712
712
 
@@ -18,7 +18,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
18
18
 
19
19
  ## Performance Note
20
20
 
21
- In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
21
+ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
22
22
 
23
23
  ---
24
24
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "warpgbm"
7
- version = "0.1.17"
7
+ version = "0.1.18"
8
8
  description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ from warpgbm import WarpGBM
3
+ from sklearn.datasets import make_regression
4
+
5
+ import numpy as np
6
+ import time
7
+ from warpgbm import WarpGBM
8
+ from sklearn.datasets import make_regression
9
+
10
+ def test_fit_predictpytee_correlation():
11
+ np.random.seed(42)
12
+ N = 100_000
13
+ F = 1000
14
+ X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
15
+ era = np.zeros(N, dtype=np.int32)
16
+ corrs = []
17
+
18
+ for hist_type in ['hist1', 'hist2', 'hist3']:
19
+ print(f"\nTesting histogram method: {hist_type}")
20
+
21
+ model = WarpGBM(
22
+ max_depth=10,
23
+ num_bins=10,
24
+ n_estimators=10,
25
+ learning_rate=1,
26
+ verbosity=False,
27
+ histogram_computer=hist_type,
28
+ threads_per_block=128,
29
+ rows_per_thread=4
30
+ )
31
+
32
+ start_fit = time.time()
33
+ model.fit(X, y, era_id=era)
34
+ fit_time = time.time() - start_fit
35
+ print(f" Fit time: {fit_time:.3f} seconds")
36
+
37
+ start_pred = time.time()
38
+ preds = model.predict(X)
39
+ pred_time = time.time() - start_pred
40
+ print(f" Predict time: {pred_time:.3f} seconds")
41
+
42
+ corr = np.corrcoef(preds, y)[0, 1]
43
+ print(f" Correlation: {corr:.4f}")
44
+ corrs.append(corr)
45
+
46
+ assert (np.array(corrs) > 0.95).all(), f"In-sample correlation too low: {corrs}"
@@ -0,0 +1 @@
1
+ 0.1.18
@@ -12,36 +12,6 @@ histogram_kernels = {
12
12
  'hist3': node_kernel.compute_histogram3
13
13
  }
14
14
 
15
- @torch.jit.script
16
- def jit_find_best_split(
17
- G: Tensor, H: Tensor,
18
- lambda_l2: float,
19
- lambda_l1: float, # unused placeholder for now
20
- min_split_gain: float,
21
- min_child_weight: float
22
- ) -> Tuple[int, int]:
23
- F, B = G.size()
24
- Bm1 = B - 1
25
-
26
- GH = torch.stack([G, H], dim=0).cumsum(dim=2) # [2, F, B]
27
- GL, HL = GH[0, :, :-1], GH[1, :, :-1] # [F, B-1]
28
- GP, HP = GH[0, :, -1:], GH[1, :, -1:] # [F, 1]
29
- GR = GP - GL
30
- HR = HP - HL
31
-
32
- # Validity mask using raw child hessians
33
- valid = (HL >= min_child_weight) & (HR >= min_child_weight)
34
- g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
35
- gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
36
-
37
- gain_flat = gain.view(-1)
38
- best_idx = torch.argmax(gain_flat)
39
-
40
- if gain_flat[best_idx].item() == float('-inf'):
41
- return -1, -1
42
-
43
- return best_idx // Bm1, best_idx % Bm1
44
-
45
15
  class WarpGBM(BaseEstimator, RegressorMixin):
46
16
  def __init__(
47
17
  self,
@@ -76,12 +46,8 @@ class WarpGBM(BaseEstimator, RegressorMixin):
76
46
  self.Y_gpu = None
77
47
  self.num_features = None
78
48
  self.num_samples = None
79
- self.out_feature = torch.zeros(1, device=self.device, dtype=torch.int32)
80
- self.out_bin = torch.zeros(1, device=self.device, dtype=torch.int32)
81
49
  self.min_child_weight = min_child_weight
82
50
  self.min_split_gain = min_split_gain
83
- self.best_gain = torch.tensor([-float('inf')], dtype=torch.float32, device=self.device)
84
- self.best_feature = torch.tensor([-1], dtype=torch.int32, device=self.device)
85
51
  self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
86
52
  self.compute_histogram = histogram_kernels[histogram_computer]
87
53
  self.threads_per_block = threads_per_block
@@ -98,8 +64,10 @@ class WarpGBM(BaseEstimator, RegressorMixin):
98
64
  self.root_node_indices = torch.arange(self.num_samples, device=self.device)
99
65
  self.base_prediction = self.Y_gpu.mean().item()
100
66
  self.gradients += self.base_prediction
101
- self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
102
- self.forest = self.grow_forest()
67
+ self.best_gains = torch.zeros(self.num_features, device=self.device)
68
+ self.best_bins = torch.zeros(self.num_features, device=self.device, dtype=torch.int32)
69
+ with torch.no_grad():
70
+ self.forest = self.grow_forest()
103
71
  return self
104
72
 
105
73
  def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
@@ -156,15 +124,24 @@ class WarpGBM(BaseEstimator, RegressorMixin):
156
124
  return grad_hist, hess_hist
157
125
 
158
126
  def find_best_split(self, gradient_histogram, hessian_histogram):
159
- f,b = jit_find_best_split(
127
+ node_kernel.compute_split(
160
128
  gradient_histogram,
161
129
  hessian_histogram,
162
- self.L2_reg,
163
- self.L1_reg,
164
130
  self.min_split_gain,
165
131
  self.min_child_weight,
132
+ self.L2_reg,
133
+ self.best_gains,
134
+ self.best_bins,
135
+ self.threads_per_block
166
136
  )
167
- return (f, b)
137
+
138
+ if torch.all(self.best_bins == -1):
139
+ return -1, -1 # No valid split found
140
+
141
+ f = torch.argmax(self.best_gains).item()
142
+ b = self.best_bins[f].item()
143
+
144
+ return f, b
168
145
 
169
146
  def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
170
147
  if depth == self.max_depth:
@@ -208,27 +185,26 @@ class WarpGBM(BaseEstimator, RegressorMixin):
208
185
  return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
209
186
 
210
187
  def grow_forest(self):
211
- with torch.no_grad():
212
- forest = [{} for _ in range(self.n_estimators)]
213
- self.training_loss = []
214
-
215
- for i in tqdm( range(self.n_estimators) ):
216
- self.residual = self.Y_gpu - self.gradients
217
-
218
- self.root_gradient_histogram, self.root_hessian_histogram = \
219
- self.compute_histograms(self.bin_indices, self.residual)
220
-
221
- tree = self.grow_tree(
222
- self.root_gradient_histogram,
223
- self.root_hessian_histogram,
224
- self.root_node_indices,
225
- depth=0
226
- )
227
- forest[i] = tree
228
- # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
229
- # self.training_loss.append(loss)
230
- # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
188
+ forest = [{} for _ in range(self.n_estimators)]
189
+ self.training_loss = []
231
190
 
191
+ for i in tqdm( range(self.n_estimators) ):
192
+ self.residual = self.Y_gpu - self.gradients
193
+
194
+ self.root_gradient_histogram, self.root_hessian_histogram = \
195
+ self.compute_histograms(self.bin_indices, self.residual)
196
+
197
+ tree = self.grow_tree(
198
+ self.root_gradient_histogram,
199
+ self.root_hessian_histogram,
200
+ self.root_node_indices,
201
+ depth=0
202
+ )
203
+ forest[i] = tree
204
+ # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
205
+ # self.training_loss.append(loss)
206
+ # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
207
+
232
208
  print("Finished training forest.")
233
209
  return forest
234
210
 
@@ -0,0 +1,79 @@
1
+ #include <torch/extension.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime.h>
4
+
5
+ __global__ void best_split_kernel_global_only(
6
+ const float *__restrict__ G, // [F x B]
7
+ const float *__restrict__ H, // [F x B]
8
+ int F,
9
+ int B,
10
+ float min_split_gain,
11
+ float min_child_samples,
12
+ float eps,
13
+ float *__restrict__ best_gains, // [F]
14
+ int *__restrict__ best_bins // [F]
15
+ )
16
+ {
17
+ int f = blockIdx.x * blockDim.x + threadIdx.x;
18
+ if (f >= F)
19
+ return;
20
+
21
+ float G_total = 0.0f, H_total = 0.0f;
22
+ for (int b = 0; b < B; ++b)
23
+ {
24
+ G_total += G[f * B + b];
25
+ H_total += H[f * B + b];
26
+ }
27
+
28
+ float G_L = 0.0f, H_L = 0.0f;
29
+ float best_gain = min_split_gain;
30
+ int best_bin = -1;
31
+
32
+ for (int b = 0; b < B - 1; ++b)
33
+ {
34
+ G_L += G[f * B + b];
35
+ H_L += H[f * B + b];
36
+ float G_R = G_total - G_L;
37
+ float H_R = H_total - H_L;
38
+
39
+ if (H_L >= min_child_samples && H_R >= min_child_samples)
40
+ {
41
+ float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
42
+ if (gain > best_gain)
43
+ {
44
+ best_gain = gain;
45
+ best_bin = b;
46
+ }
47
+ }
48
+ }
49
+
50
+ best_gains[f] = best_gain;
51
+ best_bins[f] = best_bin;
52
+ }
53
+
54
+ void launch_best_split_kernel_cuda(
55
+ const at::Tensor &G, // [F x B]
56
+ const at::Tensor &H, // [F x B]
57
+ float min_split_gain,
58
+ float min_child_samples,
59
+ float eps,
60
+ at::Tensor &best_gains, // [F], float32
61
+ at::Tensor &best_bins, // [F], int32
62
+ int threads)
63
+ {
64
+ int F = G.size(0);
65
+ int B = G.size(1);
66
+
67
+ int blocks = (F + threads - 1) / threads;
68
+
69
+ best_split_kernel_global_only<<<blocks, threads>>>(
70
+ G.data_ptr<float>(),
71
+ H.data_ptr<float>(),
72
+ F,
73
+ B,
74
+ min_split_gain,
75
+ min_child_samples,
76
+ eps,
77
+ best_gains.data_ptr<float>(),
78
+ best_bins.data_ptr<int>());
79
+ }
@@ -21,15 +21,14 @@ void launch_histogram_kernel_cuda_2(
21
21
  int rows_per_thread = 1);
22
22
 
23
23
  void launch_best_split_kernel_cuda(
24
- const at::Tensor &G,
25
- const at::Tensor &H,
26
- int F,
27
- int B,
24
+ const at::Tensor &G, // [F x B]
25
+ const at::Tensor &H, // [F x B]
28
26
  float min_split_gain,
29
27
  float min_child_samples,
30
28
  float eps,
31
- at::Tensor &out_feature,
32
- at::Tensor &out_bin);
29
+ at::Tensor &best_gains, // [F], float32
30
+ at::Tensor &best_bins,
31
+ int threads);
33
32
 
34
33
  void launch_histogram_kernel_cuda_configurable(
35
34
  const at::Tensor &bin_indices,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: warpgbm
3
- Version: 0.1.17
3
+ Version: 0.1.18
4
4
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
5
5
  License: GNU GENERAL PUBLIC LICENSE
6
6
  Version 3, 29 June 2007
@@ -706,7 +706,7 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
706
706
 
707
707
  ## Performance Note
708
708
 
709
- In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
709
+ In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM's CPU version and **2x faster** on the GPU version using default configurations. Speed also outperforms XGBoost and CatBoost on regression problems. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
710
710
 
711
711
  ---
712
712
 
@@ -1,66 +0,0 @@
1
- import numpy as np
2
- from warpgbm import WarpGBM
3
- from sklearn.datasets import make_regression
4
-
5
- def test_fit_predict_correlation():
6
- np.random.seed(42)
7
- N = 1_000_000
8
- F = 100
9
- X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
10
- era = np.zeros(N, dtype=np.int32)
11
- corrs = []
12
-
13
- model = WarpGBM(
14
- max_depth = 10,
15
- num_bins = 10,
16
- n_estimators = 10,
17
- learning_rate = 1,
18
- verbosity=False,
19
- histogram_computer='hist1',
20
- threads_per_block=32,
21
- rows_per_thread=4
22
- )
23
-
24
- model.fit(X, y, era_id=era)
25
- preds = model.predict(X)
26
-
27
- # Pearson correlation in-sample
28
- corr = np.corrcoef(preds, y)[0, 1]
29
- corrs.append(corr)
30
-
31
- model = WarpGBM(
32
- max_depth = 10,
33
- num_bins = 10,
34
- n_estimators = 10,
35
- learning_rate = 1,
36
- verbosity=False,
37
- histogram_computer='hist2',
38
- threads_per_block=32,
39
- rows_per_thread=4
40
- )
41
-
42
- model.fit(X, y, era_id=era)
43
- preds = model.predict(X)
44
-
45
- # Pearson correlation in-sample
46
- corr = np.corrcoef(preds, y)[0, 1]
47
- corrs.append(corr)
48
-
49
- model = WarpGBM(
50
- max_depth = 10,
51
- num_bins = 10,
52
- n_estimators = 10,
53
- learning_rate = 1,
54
- verbosity=False,
55
- histogram_computer='hist3',
56
- threads_per_block=32,
57
- rows_per_thread=4
58
- )
59
-
60
- model.fit(X, y, era_id=era)
61
- preds = model.predict(X)
62
-
63
- # Pearson correlation in-sample
64
- corr = np.corrcoef(preds, y)[0, 1]
65
- corrs.append(corr)
66
- assert ( np.array(corrs) > 0.95 ).all(), f"In-sample correlation too low: {corr:.4f}"
@@ -1 +0,0 @@
1
- 0.1.17
@@ -1,112 +0,0 @@
1
- #include <torch/extension.h>
2
- #include <cuda.h>
3
- #include <cuda_runtime.h>
4
-
5
- __global__ void best_split_kernel(
6
- const float *__restrict__ G, // [F x B]
7
- const float *__restrict__ H, // [F x B]
8
- int F,
9
- int B,
10
- float min_split_gain,
11
- float min_child_samples,
12
- float eps,
13
- int *out_feature,
14
- int *out_bin,
15
- void *shared_mem)
16
- {
17
- int f = blockIdx.x * blockDim.x + threadIdx.x;
18
- if (f >= F)
19
- return;
20
-
21
- // Cast shared memory
22
- extern __shared__ char smem[];
23
- float *gains = reinterpret_cast<float *>(smem);
24
- int *features = reinterpret_cast<int *>(&gains[blockDim.x]);
25
- int *bins = reinterpret_cast<int *>(&features[blockDim.x]);
26
-
27
- // Calculate total G and H for this feature
28
- float G_total = 0.0f, H_total = 0.0f;
29
- for (int b = 0; b < B; ++b)
30
- {
31
- G_total += G[f * B + b];
32
- H_total += H[f * B + b];
33
- }
34
-
35
- float G_L = 0.0f, H_L = 0.0f;
36
- float best_gain = min_split_gain;
37
- int best_bin = -1;
38
-
39
- for (int b = 0; b < B - 1; ++b)
40
- {
41
- G_L += G[f * B + b];
42
- H_L += H[f * B + b];
43
- float G_R = G_total - G_L;
44
- float H_R = H_total - H_L;
45
-
46
- if (H_L > min_child_samples && H_R > min_child_samples)
47
- {
48
- float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
49
- if (gain > best_gain)
50
- {
51
- best_gain = gain;
52
- best_bin = b;
53
- }
54
- }
55
- }
56
-
57
- gains[threadIdx.x] = best_gain;
58
- features[threadIdx.x] = f;
59
- bins[threadIdx.x] = best_bin;
60
- __syncthreads();
61
-
62
- // Thread 0 in each block finds best among its block
63
- if (threadIdx.x == 0)
64
- {
65
- float block_best_gain = min_split_gain;
66
- int block_best_feature = -1;
67
- int block_best_bin = -1;
68
- for (int i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < F; ++i)
69
- {
70
- if (gains[i] > block_best_gain)
71
- {
72
- block_best_gain = gains[i];
73
- block_best_feature = features[i];
74
- block_best_bin = bins[i];
75
- }
76
- }
77
-
78
- // Write to global outputs
79
- *out_feature = block_best_feature;
80
- *out_bin = block_best_bin;
81
- }
82
- }
83
-
84
- void launch_best_split_kernel_cuda(
85
- const at::Tensor &G,
86
- const at::Tensor &H,
87
- int F,
88
- int B,
89
- float min_split_gain,
90
- float min_child_samples,
91
- float eps,
92
- at::Tensor &out_feature,
93
- at::Tensor &out_bin)
94
- {
95
- int threads = 256;
96
- int blocks = (F + threads - 1) / threads;
97
-
98
- size_t shared_mem_bytes = threads * (sizeof(float) + 2 * sizeof(int));
99
-
100
- best_split_kernel<<<blocks, threads, shared_mem_bytes>>>(
101
- G.data_ptr<float>(),
102
- H.data_ptr<float>(),
103
- F,
104
- B,
105
- min_split_gain,
106
- min_child_samples,
107
- eps,
108
- out_feature.data_ptr<int>(),
109
- out_bin.data_ptr<int>(),
110
- nullptr // shared memory pointer not needed; just launch size
111
- );
112
- }
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes