warpgbm 0.1.15.tar.gz → 0.1.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.15
+Version: 0.1.17
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -735,6 +735,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
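Note: the wheel filename in the new Windows instructions still references 0.1.15 even though this release is 0.1.17; the actual name of the wheel produced in `dist\` depends on the package version and the local Python ABI. A version-agnostic variant of the same steps (a sketch, not from the packaged README; it relies only on standard pip flags):

```
git clone https://github.com/jefferythewind/warpgbm.git
cd warpgbm
python setup.py bdist_wheel
python -m pip install --no-index --find-links=dist warpgbm
```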
@@ -47,6 +47,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "warpgbm"
-version = "0.1.15"
+version = "0.1.17"
 description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -22,6 +22,7 @@ def get_extensions():
         sources=[
             "warpgbm/cuda/histogram_kernel.cu",
             "warpgbm/cuda/best_split_kernel.cu",
+            "warpgbm/cuda/binner.cu",
             "warpgbm/cuda/node_kernel.cpp",
         ]
     )
@@ -1,14 +1,12 @@
 import numpy as np
 from warpgbm import WarpGBM
+from sklearn.datasets import make_regression
 
 def test_fit_predict_correlation():
     np.random.seed(42)
-    N = 500
-    F = 5
-    X = np.random.randn(N, F).astype(np.float32)
-    true_weights = np.array([0.5, -1.0, 2.0, 0.0, 1.0])
-    noise = 0.1 * np.random.randn(N)
-    y = (X @ true_weights + noise).astype(np.float32)
+    N = 1_000_000
+    F = 100
+    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
     era = np.zeros(N, dtype=np.int32)
     corrs = []
 
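Note: the rewritten test is now effectively a scale benchmark. A back-of-envelope check of the input size it allocates (a sketch based only on the shapes in the hunk above; `make_regression` returns float64 arrays):

```python
# Rough footprint of the new test inputs before any dtype conversion.
N, F = 1_000_000, 100
x_bytes = N * F * 8       # float64 = 8 bytes per value
print(x_bytes / 1e9)      # 0.8 -> about 0.8 GB for X alone
```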
@@ -0,0 +1 @@
+0.1.17
@@ -3,6 +3,8 @@ import numpy as np
 from sklearn.base import BaseEstimator, RegressorMixin
 from warpgbm.cuda import node_kernel
 from tqdm import tqdm
+from typing import Tuple
+from torch import Tensor
 
 histogram_kernels = {
     'hist1': node_kernel.compute_histogram,
@@ -10,6 +12,36 @@ histogram_kernels = {
     'hist3': node_kernel.compute_histogram3
 }
 
+@torch.jit.script
+def jit_find_best_split(
+    G: Tensor, H: Tensor,
+    lambda_l2: float,
+    lambda_l1: float,  # unused placeholder for now
+    min_split_gain: float,
+    min_child_weight: float
+) -> Tuple[int, int]:
+    F, B = G.size()
+    Bm1 = B - 1
+
+    GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
+    GL, HL = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
+    GP, HP = GH[0, :, -1:], GH[1, :, -1:]  # [F, 1]
+    GR = GP - GL
+    HR = HP - HL
+
+    # Validity mask using raw child hessians
+    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
+    g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
+    gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
+
+    gain_flat = gain.view(-1)
+    best_idx = torch.argmax(gain_flat)
+
+    if gain_flat[best_idx].item() == float('-inf'):
+        return -1, -1
+
+    return best_idx // Bm1, best_idx % Bm1
+
 class WarpGBM(BaseEstimator, RegressorMixin):
     def __init__(
         self,
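Note: to make the new split search easier to follow, here is a self-contained eager-mode sketch of the same gain computation (illustrative only, not part of the package; names mirror the TorchScript function above):

```python
import torch

def find_best_split_reference(G, H, lambda_l2, min_split_gain, min_child_weight):
    # G, H: [F, B] histograms of gradient / hessian sums per feature and bin.
    F, B = G.shape
    GL, HL = G.cumsum(1)[:, :-1], H.cumsum(1)[:, :-1]        # left-child prefix sums, [F, B-1]
    GP, HP = G.sum(1, keepdim=True), H.sum(1, keepdim=True)  # parent totals, [F, 1]
    GR, HR = GP - GL, HP - HL                                # right child = parent minus left
    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
    g = GR**2 / (HR + lambda_l2) + GL**2 / (HL + lambda_l2) - GP**2 / (HP + lambda_l2)
    gain = torch.where(valid & (g >= min_split_gain), g, torch.full_like(g, -1.0))
    best = int(torch.argmax(gain.view(-1)))
    if gain.view(-1)[best] < 0:  # every candidate was masked out
        return -1, -1
    return best // (B - 1), best % (B - 1)

# Toy histograms: 3 features x 8 bins.
f, b = find_best_split_reference(torch.randn(3, 8), torch.rand(3, 8) + 0.1,
                                 lambda_l2=1e-6, min_split_gain=0.0, min_child_weight=1e-3)
```

One detail worth flagging: the TorchScript version fills masked-out candidates with -1.0 but tests the winner against float('-inf'), so the no-valid-split branch will not fire on that sentinel; the sketch above tests against the sentinel it actually writes.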
@@ -24,6 +56,7 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         threads_per_block=64,
         rows_per_thread=4,
         L2_reg = 1e-6,
+        L1_reg = 0.0,
         device = 'cuda'
     ):
         self.num_bins = num_bins
@@ -54,7 +87,7 @@
         self.threads_per_block = threads_per_block
         self.rows_per_thread = rows_per_thread
         self.L2_reg = L2_reg
-
+        self.L1_reg = L1_reg
 
     def fit(self, X, y, era_id=None):
         if era_id is None:
@@ -68,42 +101,44 @@
         self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
         self.forest = self.grow_forest()
         return self
-
-    def compute_quantile_bins(self, X, num_bins):
-        quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]  # exclude 0% and 100%
-        bin_edges = torch.quantile(X, quantiles, dim=0)  # shape: [B-1, F]
-        return bin_edges.T  # shape: [F, B-1]
 
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
-        self.num_samples, self.num_features = X_np.shape
-        Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
-        era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
-        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-        if is_integer_type:
-            max_vals = X_np.max(axis=0)
-            if np.all(max_vals < self.num_bins):
-                print("Detected pre-binned integer input — skipping quantile binning.")
-                bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
-
-                # We'll store None or an empty tensor in self.bin_edges
-                # to indicate that we skip binning at predict-time
-                bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
-                bin_edges = bin_edges.to(self.device)
-                unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
-                return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
-            else:
-                print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
-
-        print("Performing quantile binning on CPU...")
-        X_cpu = torch.from_numpy(X_np).type(torch.float32)  # CPU tensor
-        bin_edges_cpu = self.compute_quantile_bins(X_cpu, self.num_bins).type(torch.float32).contiguous()
-        bin_indices_cpu = torch.empty((self.num_samples, self.num_features), dtype=torch.int8)
-        for f in range(self.num_features):
-            bin_indices_cpu[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
-        bin_indices = bin_indices_cpu.to(self.device).contiguous()
-        bin_edges = bin_edges_cpu.to(self.device)
-        unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
-        return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+        with torch.no_grad():
+            self.num_samples, self.num_features = X_np.shape
+            Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
+            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    print("Detected pre-binned integer input — skipping quantile binning.")
+                    bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+
+                    # We'll store None or an empty tensor in self.bin_edges
+                    # to indicate that we skip binning at predict-time
+                    bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
+                    bin_edges = bin_edges.to(self.device)
+                    unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+                    return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+                else:
+                    print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
+
+            bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
+            bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
+
+            X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
+
+            for f in range(self.num_features):
+                X_f = X_np[:, f].to('cuda', non_blocking=True)
+                quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
+                bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous()  # shape: [B-1] for 1D input
+                bin_indices_f = bin_indices[:, f].contiguous()  # view into output
+                node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
+                bin_indices[:,f] = bin_indices_f
+                bin_edges[f,:] = bin_edges_f
+
+            unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+            return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
 
     def compute_histograms(self, bin_indices_sub, gradients):
         grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
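Note: the preprocessing rewrite moves binning to the GPU and streams one feature column at a time out of pinned (page-locked) host memory, which is what makes the non_blocking copy meaningful. A minimal sketch of that transfer pattern in isolation (assumes a CUDA device; 16 bins here, hence 15 interior quantile edges):

```python
import torch

if torch.cuda.is_available():
    X_host = torch.randn(100_000, 16).pin_memory()  # page-locked host buffer
    for f in range(X_host.shape[1]):
        # non_blocking=True lets the host-to-device copy overlap with queued GPU work
        X_f = X_host[:, f].to('cuda', non_blocking=True)
        quantiles = torch.linspace(0, 1, 17, device='cuda')[1:-1]
        edges = torch.quantile(X_f, quantiles).contiguous()  # sorted edges, shape [15]
        # ...a binning kernel would consume X_f and edges here...
```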
@@ -121,20 +156,14 @@
         return grad_hist, hess_hist
 
     def find_best_split(self, gradient_histogram, hessian_histogram):
-        node_kernel.compute_split(
-            gradient_histogram.contiguous(),
-            hessian_histogram.contiguous(),
-            self.num_features,
-            self.num_bins,
+        f,b = jit_find_best_split(
+            gradient_histogram,
+            hessian_histogram,
+            self.L2_reg,
+            self.L1_reg,
             self.min_split_gain,
             self.min_child_weight,
-            self.L2_reg,
-            self.out_feature,
-            self.out_bin
         )
-
-        f = int(self.out_feature[0])
-        b = int(self.out_bin[0])
         return (f, b)
 
     def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
@@ -179,24 +208,25 @@
         return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
 
     def grow_forest(self):
-        forest = [{} for _ in range(self.n_estimators)]
-        self.training_loss = []
-
-        for i in range(self.n_estimators):
-            self.residual = self.Y_gpu - self.gradients
-
-            self.root_gradient_histogram, self.root_hessian_histogram = \
-                self.compute_histograms(self.bin_indices, self.residual)
-
-            tree = self.grow_tree(
-                self.root_gradient_histogram,
-                self.root_hessian_histogram,
-                self.root_node_indices,
-                depth=0
-            )
-            forest[i] = tree
-            loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
-            self.training_loss.append(loss)
+        with torch.no_grad():
+            forest = [{} for _ in range(self.n_estimators)]
+            self.training_loss = []
+
+            for i in tqdm( range(self.n_estimators) ):
+                self.residual = self.Y_gpu - self.gradients
+
+                self.root_gradient_histogram, self.root_hessian_histogram = \
+                    self.compute_histograms(self.bin_indices, self.residual)
+
+                tree = self.grow_tree(
+                    self.root_gradient_histogram,
+                    self.root_hessian_histogram,
+                    self.root_node_indices,
+                    depth=0
+                )
+                forest[i] = tree
+                # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
+                # self.training_loss.append(loss)
             # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
 
         print("Finished training forest.")
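Note: nothing in this training loop needs gradients, so the new torch.no_grad() wrapper acts as a guard that no op inside it ever records an autograd graph (and avoids that bookkeeping overhead) regardless of input flags. A two-line illustration of the guarantee (a generic PyTorch sketch, not package code):

```python
import torch

w = torch.randn(3, requires_grad=True)
with torch.no_grad():
    y = w * 2             # no graph is recorded inside the block
print(y.requires_grad)    # False
```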
@@ -208,103 +238,104 @@
         We assume `flatten_forest_to_tensors` has produced self.flat_forest with
         "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
         """
-        # 1) Convert X_np -> bin_indices
-        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-        if is_integer_type:
-            max_vals = X_np.max(axis=0)
-            if np.all(max_vals < self.num_bins):
-                bin_indices = X_np.astype(np.int8)
+        with torch.no_grad():
+            # 1) Convert X_np -> bin_indices
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    bin_indices = X_np.astype(np.int8)
+                else:
+                    raise ValueError("Pre-binned integers must be < num_bins")
             else:
-                raise ValueError("Pre-binned integers must be < num_bins")
-        else:
-            X_cpu = torch.from_numpy(X_np).type(torch.float32)
-            bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
-            bin_edges_cpu = self.bin_edges.to('cpu')
-            for f in range(self.num_features):
-                bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
-            bin_indices = bin_indices.numpy()
-
-        # 2) Ensure we have a padded representation
-        self.flat_forest = self.flatten_forest_to_tensors(self.forest)
-
-        features_t = self.flat_forest["features"]  # [n_trees, max_nodes], int16
-        thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
-        values_t = self.flat_forest["leaf_values"]  # [n_trees, max_nodes], float32
-        max_nodes = self.flat_forest["max_nodes"]
-
-        n_trees = features_t.shape[0]
-        N = bin_indices.shape[0]
-        out = np.zeros(N, dtype=np.float32)
-
-        # 3) Process rows in chunks
-        for start in tqdm(range(0, N, chunk_size)):
-            end = min(start + chunk_size, N)
-            chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
-            chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
-
-            # Accumulate raw (unscaled) leaf sums
-            chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
-
-            # node_idx[i] tracks the current node index in the padded tree for row i
-            node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
-
-            # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
-            active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
-
-            for t in range(n_trees):
-                # Reset for each tree (each tree is independent)
-                node_idx.fill_(0)
-                active.fill_(True)
-
-                tree_features = features_t[t]  # shape [max_nodes], int16
-                tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
-                tree_values = values_t[t]  # shape [max_nodes], float32
-
-                # Up to self.max_depth+1 layers
-                for _level in range(self.max_depth + 1):
-                    active_idx = active.nonzero(as_tuple=True)[0]
-                    if active_idx.numel() == 0:
-                        break  # all rows are done in this tree
-
-                    current_node_idx = node_idx[active_idx]
-                    f = tree_features[current_node_idx]  # shape [#active], int16
-                    thr = tree_thresh[current_node_idx]  # shape [#active], int16
-                    vals = tree_values[current_node_idx]  # shape [#active], float32
-
-                    mask_no_node = (f == -2)
-                    mask_leaf = (f == -1)
-
-                    # If leaf, add leaf value and mark inactive.
-                    if mask_leaf.any():
-                        leaf_rows = active_idx[mask_leaf]
-                        chunk_preds[leaf_rows] += vals[mask_leaf]
-                        active[leaf_rows] = False
-
-                    # If no node, mark inactive.
-                    if mask_no_node.any():
-                        no_node_rows = active_idx[mask_no_node]
-                        active[no_node_rows] = False
-
-                    # For internal nodes, perform bin comparison.
-                    mask_internal = (~mask_leaf & ~mask_no_node)
-                    if mask_internal.any():
-                        internal_rows = active_idx[mask_internal]
-                        act_f = f[mask_internal].long()
-                        act_thr = thr[mask_internal]
-                        binvals = chunk_gpu[internal_rows, act_f]
-                        go_left = (binvals <= act_thr)
-                        new_left_idx = current_node_idx[mask_internal] * 2 + 1
-                        new_right_idx = current_node_idx[mask_internal] * 2 + 2
-                        node_idx[internal_rows[go_left]] = new_left_idx[go_left]
-                        node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
-                # end per-tree layer loop
-            # end for each tree
-
-            out[start:end] = (
-                self.base_prediction + self.learning_rate * chunk_preds
-            ).cpu().numpy()
-
-        return out
+                X_cpu = torch.from_numpy(X_np).type(torch.float32)
+                bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
+                bin_edges_cpu = self.bin_edges.to('cpu')
+                for f in range(self.num_features):
+                    bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
+                bin_indices = bin_indices.numpy()
+
+            # 2) Ensure we have a padded representation
+            self.flat_forest = self.flatten_forest_to_tensors(self.forest)
+
+            features_t = self.flat_forest["features"]  # [n_trees, max_nodes], int16
+            thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
+            values_t = self.flat_forest["leaf_values"]  # [n_trees, max_nodes], float32
+            max_nodes = self.flat_forest["max_nodes"]
+
+            n_trees = features_t.shape[0]
+            N = bin_indices.shape[0]
+            out = np.zeros(N, dtype=np.float32)
+
+            # 3) Process rows in chunks
+            for start in tqdm(range(0, N, chunk_size)):
+                end = min(start + chunk_size, N)
+                chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
+                chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
+
+                # Accumulate raw (unscaled) leaf sums
+                chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
+
+                # node_idx[i] tracks the current node index in the padded tree for row i
+                node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
+
+                # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
+                active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
+
+                for t in range(n_trees):
+                    # Reset for each tree (each tree is independent)
+                    node_idx.fill_(0)
+                    active.fill_(True)
+
+                    tree_features = features_t[t]  # shape [max_nodes], int16
+                    tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
+                    tree_values = values_t[t]  # shape [max_nodes], float32
+
+                    # Up to self.max_depth+1 layers
+                    for _level in range(self.max_depth + 1):
+                        active_idx = active.nonzero(as_tuple=True)[0]
+                        if active_idx.numel() == 0:
+                            break  # all rows are done in this tree
+
+                        current_node_idx = node_idx[active_idx]
+                        f = tree_features[current_node_idx]  # shape [#active], int16
+                        thr = tree_thresh[current_node_idx]  # shape [#active], int16
+                        vals = tree_values[current_node_idx]  # shape [#active], float32
+
+                        mask_no_node = (f == -2)
+                        mask_leaf = (f == -1)
+
+                        # If leaf, add leaf value and mark inactive.
+                        if mask_leaf.any():
+                            leaf_rows = active_idx[mask_leaf]
+                            chunk_preds[leaf_rows] += vals[mask_leaf]
+                            active[leaf_rows] = False
+
+                        # If no node, mark inactive.
+                        if mask_no_node.any():
+                            no_node_rows = active_idx[mask_no_node]
+                            active[no_node_rows] = False
+
+                        # For internal nodes, perform bin comparison.
+                        mask_internal = (~mask_leaf & ~mask_no_node)
+                        if mask_internal.any():
+                            internal_rows = active_idx[mask_internal]
+                            act_f = f[mask_internal].long()
+                            act_thr = thr[mask_internal]
+                            binvals = chunk_gpu[internal_rows, act_f]
+                            go_left = (binvals <= act_thr)
+                            new_left_idx = current_node_idx[mask_internal] * 2 + 1
+                            new_right_idx = current_node_idx[mask_internal] * 2 + 2
+                            node_idx[internal_rows[go_left]] = new_left_idx[go_left]
+                            node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
+                    # end per-tree layer loop
+                # end for each tree
+
+                out[start:end] = (
+                    self.base_prediction + self.learning_rate * chunk_preds
+                ).cpu().numpy()
+
+            return out
 
     def flatten_forest_to_tensors(self, forest):
         """
@@ -0,0 +1,52 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void bin_column_kernel(
+    const float *__restrict__ X,          // [N]
+    const float *__restrict__ bin_edges,  // [B - 1]
+    int8_t *__restrict__ bin_indices,     // [N]
+    int N,
+    int B_minus1)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
+        return;
+
+    float val = X[idx];
+    int bin = 0;
+
+    // Linear scan over edges: bin_edges is sorted
+    while (bin < B_minus1 && val >= bin_edges[bin])
+    {
+        ++bin;
+    }
+
+    bin_indices[idx] = static_cast<int8_t>(bin);
+}
+
+// C++ launcher for calling from Python
+void launch_bin_column_kernel(
+    at::Tensor X,           // [N]
+    at::Tensor bin_edges,   // [B - 1]
+    at::Tensor bin_indices  // [N]
+)
+{
+    const int N = X.size(0);
+    const int B = bin_edges.size(0);
+
+    const int threads = 256;
+    const int blocks = (N + threads - 1) / threads;
+
+    bin_column_kernel<<<blocks, threads>>>(
+        X.data_ptr<float>(),
+        bin_edges.data_ptr<float>(),
+        bin_indices.data_ptr<int8_t>(),
+        N,
+        B);
+
+    // Optional: sync and error check
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("CUDA error: %s\n", cudaGetErrorString(err));
+}
@@ -40,6 +40,11 @@ void launch_histogram_kernel_cuda_configurable(
     int threads_per_block = 256,
     int rows_per_thread = 1);
 
+void launch_bin_column_kernel(
+    at::Tensor X,
+    at::Tensor bin_edges,
+    at::Tensor bin_indices);
+
 // Bindings
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -47,4 +52,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
     m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
    m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
+    m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
 }
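Note: with the binding above, the new kernel is reachable from Python as node_kernel.custom_cuda_binner, matching the call site in preprocess_gpu_data. A minimal sketch of a direct invocation (assumes the extension is built and a CUDA device is available; 16 bins implies 15 edges):

```python
import torch
from warpgbm.cuda import node_kernel  # compiled extension from this package

x = torch.randn(1_000_000, device='cuda')           # one float32 feature column
q = torch.linspace(0, 1, 17, device='cuda')[1:-1]   # 15 interior quantiles
edges = torch.quantile(x, q).contiguous()           # sorted bin edges, [B-1]
out = torch.empty_like(x, dtype=torch.int8)         # one int8 bin index per row
node_kernel.custom_cuda_binner(x, edges, out)       # fills `out` in place
```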
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.15
+Version: 0.1.17
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -735,6 +735,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
@@ -15,5 +15,6 @@ warpgbm.egg-info/requires.txt
 warpgbm.egg-info/top_level.txt
 warpgbm/cuda/__init__.py
 warpgbm/cuda/best_split_kernel.cu
+warpgbm/cuda/binner.cu
 warpgbm/cuda/histogram_kernel.cu
 warpgbm/cuda/node_kernel.cpp
@@ -1 +0,0 @@
-0.1.15
5 files without changes