warpgbm 0.1.16.tar.gz → 0.1.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warpgbm
- Version: 0.1.16
+ Version: 0.1.17
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "warpgbm"
- version = "0.1.16"
+ version = "0.1.17"
  description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -22,6 +22,7 @@ def get_extensions():
          sources=[
              "warpgbm/cuda/histogram_kernel.cu",
              "warpgbm/cuda/best_split_kernel.cu",
+             "warpgbm/cuda/binner.cu",
              "warpgbm/cuda/node_kernel.cpp",
          ]
      )
@@ -0,0 +1 @@
+ 0.1.17
@@ -22,21 +22,17 @@ def jit_find_best_split(
  ) -> Tuple[int, int]:
      F, B = G.size()
      Bm1 = B - 1
-     eps = 0

      GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
-     GL, HL_raw = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
+     GL, HL = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
      GP, HP = GH[0, :, -1:], GH[1, :, -1:]  # [F, 1]
-     H_R_raw = HP - HL_raw
+     GR = GP - GL
+     HR = HP - HL

      # Validity mask using raw child hessians
-     valid = (HL_raw >= min_child_weight) & (H_R_raw >= min_child_weight)
-
-     # Closed-form gain
-     HL, HP = HL_raw + lambda_l2, HP + lambda_l2
-     num = (HP * GL - HL * GP).pow(2)
-     denom = HP * HL * (HP - HL) + eps
-     gain = torch.where(valid & (num / denom >= min_split_gain), num / denom, torch.full_like(num, -float("inf")))
+     valid = (HL >= min_child_weight) & (HR >= min_child_weight)
+     g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
+     gain = torch.where(valid & (g >= min_split_gain), g, -1.0)

      gain_flat = gain.view(-1)
      best_idx = torch.argmax(gain_flat)
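In 0.1.17 the split search drops the unused eps term and the closed-form ratio in favour of the standard second-order gain, evaluated directly from the cumulative left/right/parent gradient and hessian sums. In the notation of the added lines:

    gain = GL^2 / (HL + lambda_l2) + GR^2 / (HR + lambda_l2) - GP^2 / (HP + lambda_l2)

Candidate splits whose child hessian sums fall below min_child_weight, or whose gain falls below min_split_gain, are now filled with -1.0 rather than -inf before the argmax.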
@@ -105,42 +101,44 @@ class WarpGBM(BaseEstimator, RegressorMixin):
105
101
  self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
106
102
  self.forest = self.grow_forest()
107
103
  return self
108
-
109
- def compute_quantile_bins(self, X, num_bins):
110
- quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1] # exclude 0% and 100%
111
- bin_edges = torch.quantile(X, quantiles, dim=0) # shape: [B-1, F]
112
- return bin_edges.T # shape: [F, B-1]
113
104
 
114
105
  def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
115
- self.num_samples, self.num_features = X_np.shape
116
- Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
117
- era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
118
- is_integer_type = np.issubdtype(X_np.dtype, np.integer)
119
- if is_integer_type:
120
- max_vals = X_np.max(axis=0)
121
- if np.all(max_vals < self.num_bins):
122
- print("Detected pre-binned integer input — skipping quantile binning.")
123
- bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
124
-
125
- # We'll store None or an empty tensor in self.bin_edges
126
- # to indicate that we skip binning at predict-time
127
- bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
128
- bin_edges = bin_edges.to(self.device)
129
- unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
130
- return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
131
- else:
132
- print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
133
-
134
- print("Performing quantile binning on CPU...")
135
- X_cpu = torch.from_numpy(X_np).type(torch.float32) # CPU tensor
136
- bin_edges_cpu = self.compute_quantile_bins(X_cpu, self.num_bins).type(torch.float32).contiguous()
137
- bin_indices_cpu = torch.empty((self.num_samples, self.num_features), dtype=torch.int8)
138
- for f in range(self.num_features):
139
- bin_indices_cpu[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
140
- bin_indices = bin_indices_cpu.to(self.device).contiguous()
141
- bin_edges = bin_edges_cpu.to(self.device)
142
- unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
143
- return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
106
+ with torch.no_grad():
107
+ self.num_samples, self.num_features = X_np.shape
108
+ Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
109
+ era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
110
+ is_integer_type = np.issubdtype(X_np.dtype, np.integer)
111
+ if is_integer_type:
112
+ max_vals = X_np.max(axis=0)
113
+ if np.all(max_vals < self.num_bins):
114
+ print("Detected pre-binned integer input — skipping quantile binning.")
115
+ bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
116
+
117
+ # We'll store None or an empty tensor in self.bin_edges
118
+ # to indicate that we skip binning at predict-time
119
+ bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
120
+ bin_edges = bin_edges.to(self.device)
121
+ unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
122
+ return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
123
+ else:
124
+ print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
125
+
126
+ bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
127
+ bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
128
+
129
+ X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
130
+
131
+ for f in range(self.num_features):
132
+ X_f = X_np[:, f].to('cuda', non_blocking=True)
133
+ quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
134
+ bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous() # shape: [B-1] for 1D input
135
+ bin_indices_f = bin_indices[:, f].contiguous() # view into output
136
+ node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
137
+ bin_indices[:,f] = bin_indices_f
138
+ bin_edges[f,:] = bin_edges_f
139
+
140
+ unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
141
+ return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
144
142
 
145
143
  def compute_histograms(self, bin_indices_sub, gradients):
146
144
  grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
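The rewritten preprocess_gpu_data above computes quantile edges per feature on the GPU and assigns bins with the new custom_cuda_binner kernel, streaming one pinned-memory column at a time. A minimal pure-PyTorch sketch of the per-column assignment (illustrative only, not package code; the kernel counts how many edges each value is greater than or equal to, which torch.bucketize with right=True should reproduce):

    import torch

    def bin_one_feature(x_f: torch.Tensor, num_bins: int):
        # x_f: [N] float32 feature column, already on the target device
        q = torch.linspace(0, 1, num_bins + 1, device=x_f.device, dtype=x_f.dtype)[1:-1]
        edges = torch.quantile(x_f, q)                                # [num_bins - 1] sorted interior quantile edges
        idx = torch.bucketize(x_f, edges, right=True).to(torch.int8)  # bin ids in [0, num_bins - 1]
        return idx, edges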
@@ -210,22 +208,23 @@ class WarpGBM(BaseEstimator, RegressorMixin):
          return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }

      def grow_forest(self):
-         forest = [{} for _ in range(self.n_estimators)]
-         self.training_loss = []
-
-         for i in tqdm( range(self.n_estimators) ):
-             self.residual = self.Y_gpu - self.gradients
-
-             self.root_gradient_histogram, self.root_hessian_histogram = \
-                 self.compute_histograms(self.bin_indices, self.residual)
-
-             tree = self.grow_tree(
-                 self.root_gradient_histogram,
-                 self.root_hessian_histogram,
-                 self.root_node_indices,
-                 depth=0
-             )
-             forest[i] = tree
+         with torch.no_grad():
+             forest = [{} for _ in range(self.n_estimators)]
+             self.training_loss = []
+
+             for i in tqdm( range(self.n_estimators) ):
+                 self.residual = self.Y_gpu - self.gradients
+
+                 self.root_gradient_histogram, self.root_hessian_histogram = \
+                     self.compute_histograms(self.bin_indices, self.residual)
+
+                 tree = self.grow_tree(
+                     self.root_gradient_histogram,
+                     self.root_hessian_histogram,
+                     self.root_node_indices,
+                     depth=0
+                 )
+                 forest[i] = tree
          # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
          # self.training_loss.append(loss)
          # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
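grow_forest (and predict, below) now runs inside torch.no_grad(). The boosting loop builds residuals, histograms, and split gains with plain tensor ops and never calls backward(), so disabling autograd tracking avoids recording a computation graph for every tensor created per round. A minimal illustration of the pattern (standalone, not package code):

    import torch

    x = torch.ones(3, requires_grad=True)
    with torch.no_grad():
        y = x * 2                      # no graph is recorded inside the block
    assert y.requires_grad is False    # y is a plain tensor, cheaper to create and free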
@@ -239,103 +238,104 @@ class WarpGBM(BaseEstimator, RegressorMixin):
          We assume `flatten_forest_to_tensors` has produced self.flat_forest with
          "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
          """
-         # 1) Convert X_np -> bin_indices
-         is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-         if is_integer_type:
-             max_vals = X_np.max(axis=0)
-             if np.all(max_vals < self.num_bins):
-                 bin_indices = X_np.astype(np.int8)
+         with torch.no_grad():
+             # 1) Convert X_np -> bin_indices
+             is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+             if is_integer_type:
+                 max_vals = X_np.max(axis=0)
+                 if np.all(max_vals < self.num_bins):
+                     bin_indices = X_np.astype(np.int8)
+                 else:
+                     raise ValueError("Pre-binned integers must be < num_bins")
              else:
-                 raise ValueError("Pre-binned integers must be < num_bins")
-         else:
-             X_cpu = torch.from_numpy(X_np).type(torch.float32)
-             bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
-             bin_edges_cpu = self.bin_edges.to('cpu')
-             for f in range(self.num_features):
-                 bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
-             bin_indices = bin_indices.numpy()
-
-         # 2) Ensure we have a padded representation
-         self.flat_forest = self.flatten_forest_to_tensors(self.forest)
-
-         features_t = self.flat_forest["features"]  # [n_trees, max_nodes], int16
-         thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
-         values_t = self.flat_forest["leaf_values"]  # [n_trees, max_nodes], float32
-         max_nodes = self.flat_forest["max_nodes"]
-
-         n_trees = features_t.shape[0]
-         N = bin_indices.shape[0]
-         out = np.zeros(N, dtype=np.float32)
-
-         # 3) Process rows in chunks
-         for start in tqdm(range(0, N, chunk_size)):
-             end = min(start + chunk_size, N)
-             chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
-             chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
-
-             # Accumulate raw (unscaled) leaf sums
-             chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
-
-             # node_idx[i] tracks the current node index in the padded tree for row i
-             node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
-
-             # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
-             active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
-
-             for t in range(n_trees):
-                 # Reset for each tree (each tree is independent)
-                 node_idx.fill_(0)
-                 active.fill_(True)
-
-                 tree_features = features_t[t]  # shape [max_nodes], int16
-                 tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
-                 tree_values = values_t[t]  # shape [max_nodes], float32
-
-                 # Up to self.max_depth+1 layers
-                 for _level in range(self.max_depth + 1):
-                     active_idx = active.nonzero(as_tuple=True)[0]
-                     if active_idx.numel() == 0:
-                         break  # all rows are done in this tree
-
-                     current_node_idx = node_idx[active_idx]
-                     f = tree_features[current_node_idx]  # shape [#active], int16
-                     thr = tree_thresh[current_node_idx]  # shape [#active], int16
-                     vals = tree_values[current_node_idx]  # shape [#active], float32
-
-                     mask_no_node = (f == -2)
-                     mask_leaf = (f == -1)
-
-                     # If leaf, add leaf value and mark inactive.
-                     if mask_leaf.any():
-                         leaf_rows = active_idx[mask_leaf]
-                         chunk_preds[leaf_rows] += vals[mask_leaf]
-                         active[leaf_rows] = False
-
-                     # If no node, mark inactive.
-                     if mask_no_node.any():
-                         no_node_rows = active_idx[mask_no_node]
-                         active[no_node_rows] = False
-
-                     # For internal nodes, perform bin comparison.
-                     mask_internal = (~mask_leaf & ~mask_no_node)
-                     if mask_internal.any():
-                         internal_rows = active_idx[mask_internal]
-                         act_f = f[mask_internal].long()
-                         act_thr = thr[mask_internal]
-                         binvals = chunk_gpu[internal_rows, act_f]
-                         go_left = (binvals <= act_thr)
-                         new_left_idx = current_node_idx[mask_internal] * 2 + 1
-                         new_right_idx = current_node_idx[mask_internal] * 2 + 2
-                         node_idx[internal_rows[go_left]] = new_left_idx[go_left]
-                         node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
-                 # end per-tree layer loop
-             # end for each tree
-
-             out[start:end] = (
-                 self.base_prediction + self.learning_rate * chunk_preds
-             ).cpu().numpy()
-
-         return out
+                 X_cpu = torch.from_numpy(X_np).type(torch.float32)
+                 bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
+                 bin_edges_cpu = self.bin_edges.to('cpu')
+                 for f in range(self.num_features):
+                     bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
+                 bin_indices = bin_indices.numpy()
+
+             # 2) Ensure we have a padded representation
+             self.flat_forest = self.flatten_forest_to_tensors(self.forest)
+
+             features_t = self.flat_forest["features"]  # [n_trees, max_nodes], int16
+             thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
+             values_t = self.flat_forest["leaf_values"]  # [n_trees, max_nodes], float32
+             max_nodes = self.flat_forest["max_nodes"]
+
+             n_trees = features_t.shape[0]
+             N = bin_indices.shape[0]
+             out = np.zeros(N, dtype=np.float32)
+
+             # 3) Process rows in chunks
+             for start in tqdm(range(0, N, chunk_size)):
+                 end = min(start + chunk_size, N)
+                 chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
+                 chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
+
+                 # Accumulate raw (unscaled) leaf sums
+                 chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
+
+                 # node_idx[i] tracks the current node index in the padded tree for row i
+                 node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
+
+                 # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
+                 active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
+
+                 for t in range(n_trees):
+                     # Reset for each tree (each tree is independent)
+                     node_idx.fill_(0)
+                     active.fill_(True)
+
+                     tree_features = features_t[t]  # shape [max_nodes], int16
+                     tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
+                     tree_values = values_t[t]  # shape [max_nodes], float32
+
+                     # Up to self.max_depth+1 layers
+                     for _level in range(self.max_depth + 1):
+                         active_idx = active.nonzero(as_tuple=True)[0]
+                         if active_idx.numel() == 0:
+                             break  # all rows are done in this tree
+
+                         current_node_idx = node_idx[active_idx]
+                         f = tree_features[current_node_idx]  # shape [#active], int16
+                         thr = tree_thresh[current_node_idx]  # shape [#active], int16
+                         vals = tree_values[current_node_idx]  # shape [#active], float32
+
+                         mask_no_node = (f == -2)
+                         mask_leaf = (f == -1)
+
+                         # If leaf, add leaf value and mark inactive.
+                         if mask_leaf.any():
+                             leaf_rows = active_idx[mask_leaf]
+                             chunk_preds[leaf_rows] += vals[mask_leaf]
+                             active[leaf_rows] = False
+
+                         # If no node, mark inactive.
+                         if mask_no_node.any():
+                             no_node_rows = active_idx[mask_no_node]
+                             active[no_node_rows] = False
+
+                         # For internal nodes, perform bin comparison.
+                         mask_internal = (~mask_leaf & ~mask_no_node)
+                         if mask_internal.any():
+                             internal_rows = active_idx[mask_internal]
+                             act_f = f[mask_internal].long()
+                             act_thr = thr[mask_internal]
+                             binvals = chunk_gpu[internal_rows, act_f]
+                             go_left = (binvals <= act_thr)
+                             new_left_idx = current_node_idx[mask_internal] * 2 + 1
+                             new_right_idx = current_node_idx[mask_internal] * 2 + 2
+                             node_idx[internal_rows[go_left]] = new_left_idx[go_left]
+                             node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
+                     # end per-tree layer loop
+                 # end for each tree
+
+                 out[start:end] = (
+                     self.base_prediction + self.learning_rate * chunk_preds
+                 ).cpu().numpy()
+
+             return out

      def flatten_forest_to_tensors(self, forest):
          """
@@ -0,0 +1,52 @@
+ #include <torch/extension.h>
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+
+ __global__ void bin_column_kernel(
+     const float *__restrict__ X,          // [N]
+     const float *__restrict__ bin_edges,  // [B - 1]
+     int8_t *__restrict__ bin_indices,     // [N]
+     int N,
+     int B_minus1)
+ {
+     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+     if (idx >= N)
+         return;
+
+     float val = X[idx];
+     int bin = 0;
+
+     // Linear scan over edges: bin_edges is sorted
+     while (bin < B_minus1 && val >= bin_edges[bin])
+     {
+         ++bin;
+     }
+
+     bin_indices[idx] = static_cast<int8_t>(bin);
+ }
+
+ // C++ launcher for calling from Python
+ void launch_bin_column_kernel(
+     at::Tensor X,           // [N]
+     at::Tensor bin_edges,   // [B - 1]
+     at::Tensor bin_indices  // [N]
+ )
+ {
+     const int N = X.size(0);
+     const int B = bin_edges.size(0);
+
+     const int threads = 256;
+     const int blocks = (N + threads - 1) / threads;
+
+     bin_column_kernel<<<blocks, threads>>>(
+         X.data_ptr<float>(),
+         bin_edges.data_ptr<float>(),
+         bin_indices.data_ptr<int8_t>(),
+         N,
+         B);
+
+     // Optional: sync and error check
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess)
+         printf("CUDA error: %s\n", cudaGetErrorString(err));
+ }
@@ -40,6 +40,11 @@ void launch_histogram_kernel_cuda_configurable(
      int threads_per_block = 256,
      int rows_per_thread = 1);

+ void launch_bin_column_kernel(
+     at::Tensor X,
+     at::Tensor bin_edges,
+     at::Tensor bin_indices);
+
  // Bindings
  PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
  {
@@ -47,4 +52,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
      m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
      m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
      m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
+     m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
  }
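With the extra m.def above, the compiled node_kernel extension exposes the binner to Python; preprocess_gpu_data calls it once per feature column. A short usage sketch (the import path and sizes are illustrative assumptions, and a CUDA device is required):

    import torch
    from warpgbm.cuda import node_kernel  # assumed import path for the compiled extension

    x = torch.rand(100_000, device='cuda', dtype=torch.float32)        # one feature column
    q = torch.linspace(0, 1, 11, device='cuda', dtype=x.dtype)[1:-1]   # 9 interior quantiles -> 10 bins
    edges = torch.quantile(x, q).contiguous()                          # [num_bins - 1] sorted edges
    bins = torch.empty_like(x, dtype=torch.int8)                       # int8 output buffer
    node_kernel.custom_cuda_binner(x, edges, bins)                     # fills bins in place with ids in [0, 9]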
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warpgbm
- Version: 0.1.16
+ Version: 0.1.17
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -15,5 +15,6 @@ warpgbm.egg-info/requires.txt
  warpgbm.egg-info/top_level.txt
  warpgbm/cuda/__init__.py
  warpgbm/cuda/best_split_kernel.cu
+ warpgbm/cuda/binner.cu
  warpgbm/cuda/histogram_kernel.cu
  warpgbm/cuda/node_kernel.cpp
@@ -1 +0,0 @@
- 0.1.16
6 files without changes