warpgbm 0.1.16__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.16/warpgbm.egg-info → warpgbm-0.1.17}/PKG-INFO +1 -1
- {warpgbm-0.1.16 → warpgbm-0.1.17}/pyproject.toml +1 -1
- {warpgbm-0.1.16 → warpgbm-0.1.17}/setup.py +1 -0
- warpgbm-0.1.17/version.txt +1 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/core.py +156 -156
- warpgbm-0.1.17/warpgbm/cuda/binner.cu +52 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/cuda/node_kernel.cpp +6 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17/warpgbm.egg-info}/PKG-INFO +1 -1
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm.egg-info/SOURCES.txt +1 -0
- warpgbm-0.1.16/version.txt +0 -1
- {warpgbm-0.1.16 → warpgbm-0.1.17}/LICENSE +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/MANIFEST.in +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/README.md +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/setup.cfg +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/tests/__init__.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/tests/test_fit_predict_corr.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/cuda/best_split_kernel.cu +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/cuda/histogram_kernel.cu +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm.egg-info/top_level.txt +0 -0
warpgbm-0.1.17/version.txt
ADDED
@@ -0,0 +1 @@
+0.1.17
{warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/core.py
@@ -22,21 +22,17 @@ def jit_find_best_split(
 ) -> Tuple[int, int]:
     F, B = G.size()
     Bm1 = B - 1
-    eps = 0
 
     GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
-    GL,  # (line truncated in this diff view)
+    GL, HL = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
     GP, HP = GH[0, :, -1:], GH[1, :, -1:]  # [F, 1]
-
+    GR = GP - GL
+    HR = HP - HL
 
     # Validity mask using raw child hessians
-    valid = (
-    ...  # 2 removed lines not shown in this diff view
-    HL, HP = HL_raw + lambda_l2, HP + lambda_l2
-    num = (HP * GL - HL * GP).pow(2)
-    denom = HP * HL * (HP - HL) + eps
-    gain = torch.where(valid & (num / denom >= min_split_gain), num / denom, torch.full_like(num, -float("inf")))
+    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
+    g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
+    gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
 
     gain_flat = gain.view(-1)
     best_idx = torch.argmax(gain_flat)
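The rewritten scorer replaces the eps-guarded ratio form with the standard second-order gain: each child scores G²/(H + lambda_l2), and the split gain is the children's total minus the parent's, with invalid or weak splits flagged as -1.0 instead of -inf. A minimal standalone sketch of the new computation (tensor names mirror the diff; the toy values are made up):

```python
import torch

# Toy check of the new split scorer on one feature with 4 bins.
G = torch.tensor([[0.5, -1.0, 2.0, 0.25]])   # gradient histogram, [F=1, B=4]
H = torch.tensor([[1.0,  2.0, 1.5, 1.0 ]])   # hessian histogram,  [F=1, B=4]
lambda_l2, min_child_weight, min_split_gain = 1e-6, 1.0, 0.0

GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
GL, HL = GH[0, :, :-1], GH[1, :, :-1]          # left prefix sums, [F, B-1]
GP, HP = GH[0, :, -1:], GH[1, :, -1:]          # parent totals,    [F, 1]
GR, HR = GP - GL, HP - HL                      # right child = parent - left

valid = (HL >= min_child_weight) & (HR >= min_child_weight)
g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
gain = torch.where(valid & (g >= min_split_gain), g, torch.full_like(g, -1.0))
print(gain)  # one candidate score per split point; -1.0 flags invalid splits
```

Because the scores come from prefix sums over bin histograms, every candidate split point of every feature is evaluated in a single vectorized pass.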
@@ -105,42 +101,44 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
         self.forest = self.grow_forest()
         return self
-
-    def compute_quantile_bins(self, X, num_bins):
-        quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]  # exclude 0% and 100%
-        bin_edges = torch.quantile(X, quantiles, dim=0)  # shape: [B-1, F]
-        return bin_edges.T  # shape: [F, B-1]
 
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
-        ...  # 29 removed lines: previous preprocess_gpu_data body, not shown in this diff view
+        with torch.no_grad():
+            self.num_samples, self.num_features = X_np.shape
+            Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
+            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    print("Detected pre-binned integer input — skipping quantile binning.")
+                    bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+
+                    # We'll store None or an empty tensor in self.bin_edges
+                    # to indicate that we skip binning at predict-time
+                    bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
+                    bin_edges = bin_edges.to(self.device)
+                    unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+                    return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+                else:
+                    print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
+
+            bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
+            bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
+
+            X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
+
+            for f in range(self.num_features):
+                X_f = X_np[:, f].to('cuda', non_blocking=True)
+                quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
+                bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous()  # shape: [B-1] for 1D input
+                bin_indices_f = bin_indices[:, f].contiguous()  # view into output
+                node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
+                bin_indices[:,f] = bin_indices_f
+                bin_edges[f,:] = bin_edges_f
+
+            unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+            return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
 
     def compute_histograms(self, bin_indices_sub, gradients):
         grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
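The new `preprocess_gpu_data` takes one of two paths: integer inputs whose column maxima already sit below `num_bins` are treated as pre-binned, and everything else is streamed feature-by-feature through pinned memory, given GPU quantile edges, and binned by the new `custom_cuda_binner` kernel (see `binner.cu` later in this diff). A rough CPU reference for what the per-feature path produces, assuming the kernel's count-of-edges-<=-value semantics; the data and `num_bins` here are made up:

```python
import numpy as np
import torch

# CPU reference sketch for the quantile-binning path.
X = np.random.randn(1000, 3).astype(np.float32)
num_bins = 10

quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]            # interior quantiles, as in the diff
edges = torch.quantile(torch.from_numpy(X), quantiles, dim=0).T  # [F, B-1]

# binner.cu counts edges <= value, i.e. searchsorted side="right":
bin_indices = np.stack(
    [np.searchsorted(edges[f].numpy(), X[:, f], side="right") for f in range(X.shape[1])],
    axis=1,
).astype(np.int8)
assert bin_indices.min() >= 0 and bin_indices.max() < num_bins
```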
@@ -210,22 +208,23 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
 
     def grow_forest(self):
-        ...  # 16 removed lines: previous grow_forest body, not shown in this diff view (only two truncated "self." fragments survive)
+        with torch.no_grad():
+            forest = [{} for _ in range(self.n_estimators)]
+            self.training_loss = []
+
+            for i in tqdm( range(self.n_estimators) ):
+                self.residual = self.Y_gpu - self.gradients
+
+                self.root_gradient_histogram, self.root_hessian_histogram = \
+                    self.compute_histograms(self.bin_indices, self.residual)
+
+                tree = self.grow_tree(
+                    self.root_gradient_histogram,
+                    self.root_hessian_histogram,
+                    self.root_node_indices,
+                    depth=0
+                )
+                forest[i] = tree
                 # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
                 # self.training_loss.append(loss)
                 # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
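`grow_forest` is plain gradient boosting on squared error: `self.gradients` accumulates the running prediction, each round fits a tree to the residual `Y - gradients`, and root histograms are rebuilt before every tree. An end-to-end usage sketch; the constructor and `fit` arguments below are assumptions inferred from names in this diff, not a documented API:

```python
import numpy as np
from warpgbm import WarpGBM  # top-level import assumed from the package layout

X = np.random.randn(10_000, 20).astype(np.float32)
y = (2.0 * X[:, 0] + 0.1 * np.random.randn(10_000)).astype(np.float32)
era = np.zeros(10_000, dtype=np.int32)  # a single era

model = WarpGBM(n_estimators=100, max_depth=4, num_bins=10, learning_rate=0.1)
model.fit(X, y, era_id=era)  # era_id keyword guessed from preprocess_gpu_data's signature
preds = model.predict(X)
print(np.corrcoef(preds, y)[0, 1])
```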
@@ -239,103 +238,104 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         We assume `flatten_forest_to_tensors` has produced self.flat_forest with
         "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
         """
-        ...  # 6 removed lines not shown in this diff view
+        with torch.no_grad():
+            # 1) Convert X_np -> bin_indices
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    bin_indices = X_np.astype(np.int8)
+                else:
+                    raise ValueError("Pre-binned integers must be < num_bins")
             else:
-        ...  # 89 removed lines: previous predict body, not shown in this diff view
-        return out
+                X_cpu = torch.from_numpy(X_np).type(torch.float32)
+                bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
+                bin_edges_cpu = self.bin_edges.to('cpu')
+                for f in range(self.num_features):
+                    bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
+                bin_indices = bin_indices.numpy()
+
+            # 2) Ensure we have a padded representation
+            self.flat_forest = self.flatten_forest_to_tensors(self.forest)
+
+            features_t = self.flat_forest["features"]      # [n_trees, max_nodes], int16
+            thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
+            values_t = self.flat_forest["leaf_values"]     # [n_trees, max_nodes], float32
+            max_nodes = self.flat_forest["max_nodes"]
+
+            n_trees = features_t.shape[0]
+            N = bin_indices.shape[0]
+            out = np.zeros(N, dtype=np.float32)
+
+            # 3) Process rows in chunks
+            for start in tqdm(range(0, N, chunk_size)):
+                end = min(start + chunk_size, N)
+                chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
+                chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
+
+                # Accumulate raw (unscaled) leaf sums
+                chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
+
+                # node_idx[i] tracks the current node index in the padded tree for row i
+                node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
+
+                # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
+                active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
+
+                for t in range(n_trees):
+                    # Reset for each tree (each tree is independent)
+                    node_idx.fill_(0)
+                    active.fill_(True)
+
+                    tree_features = features_t[t]  # shape [max_nodes], int16
+                    tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
+                    tree_values = values_t[t]      # shape [max_nodes], float32
+
+                    # Up to self.max_depth+1 layers
+                    for _level in range(self.max_depth + 1):
+                        active_idx = active.nonzero(as_tuple=True)[0]
+                        if active_idx.numel() == 0:
+                            break  # all rows are done in this tree
+
+                        current_node_idx = node_idx[active_idx]
+                        f = tree_features[current_node_idx]   # shape [#active], int16
+                        thr = tree_thresh[current_node_idx]   # shape [#active], int16
+                        vals = tree_values[current_node_idx]  # shape [#active], float32
+
+                        mask_no_node = (f == -2)
+                        mask_leaf = (f == -1)
+
+                        # If leaf, add leaf value and mark inactive.
+                        if mask_leaf.any():
+                            leaf_rows = active_idx[mask_leaf]
+                            chunk_preds[leaf_rows] += vals[mask_leaf]
+                            active[leaf_rows] = False
+
+                        # If no node, mark inactive.
+                        if mask_no_node.any():
+                            no_node_rows = active_idx[mask_no_node]
+                            active[no_node_rows] = False
+
+                        # For internal nodes, perform bin comparison.
+                        mask_internal = (~mask_leaf & ~mask_no_node)
+                        if mask_internal.any():
+                            internal_rows = active_idx[mask_internal]
+                            act_f = f[mask_internal].long()
+                            act_thr = thr[mask_internal]
+                            binvals = chunk_gpu[internal_rows, act_f]
+                            go_left = (binvals <= act_thr)
+                            new_left_idx = current_node_idx[mask_internal] * 2 + 1
+                            new_right_idx = current_node_idx[mask_internal] * 2 + 2
+                            node_idx[internal_rows[go_left]] = new_left_idx[go_left]
+                            node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
+                    # end per-tree layer loop
+                # end for each tree
+
+                out[start:end] = (
+                    self.base_prediction + self.learning_rate * chunk_preds
+                ).cpu().numpy()
+
+        return out
 
     def flatten_forest_to_tensors(self, forest):
         """
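The chunked predictor walks each tree in a padded, implicit binary-heap layout: node `i`'s children sit at `2*i + 1` and `2*i + 2`, a feature value of -1 marks a leaf, and -2 marks a missing node. A scalar sketch of the same traversal for one pre-binned row (the tiny tree is made up):

```python
import torch

# features[i] == -1 marks a leaf, -2 marks "no node"; children of node i
# live at 2*i + 1 (left) and 2*i + 2 (right).
features   = torch.tensor([0, -1, 3, -2, -2, -1, -1], dtype=torch.int16)
thresholds = torch.tensor([4,  0, 7,  0,  0,  0,  0], dtype=torch.int16)
leaf_vals  = torch.tensor([0.0, 0.5, 0.0, 0.0, 0.0, -0.2, 0.9])

def predict_row(binned_row):
    idx = 0
    while features[idx] >= 0:                       # still at an internal node
        f, thr = features[idx].item(), thresholds[idx].item()
        idx = 2 * idx + 1 if binned_row[f] <= thr else 2 * idx + 2
    return leaf_vals[idx].item()                    # stop at the leaf sentinel

row = torch.tensor([3, 0, 0, 9], dtype=torch.int8)  # bin 3 on feature 0 -> go left
print(predict_row(row))                             # 0.5
```

The batched version in the diff runs this walk level by level for all active rows at once, which is why the padded layout and the -2 "no node" sentinel are needed.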
warpgbm-0.1.17/warpgbm/cuda/binner.cu
ADDED
@@ -0,0 +1,52 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void bin_column_kernel(
+    const float *__restrict__ X,          // [N]
+    const float *__restrict__ bin_edges,  // [B - 1]
+    int8_t *__restrict__ bin_indices,     // [N]
+    int N,
+    int B_minus1)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
+        return;
+
+    float val = X[idx];
+    int bin = 0;
+
+    // Linear scan over edges: bin_edges is sorted
+    while (bin < B_minus1 && val >= bin_edges[bin])
+    {
+        ++bin;
+    }
+
+    bin_indices[idx] = static_cast<int8_t>(bin);
+}
+
+// C++ launcher for calling from Python
+void launch_bin_column_kernel(
+    at::Tensor X,           // [N]
+    at::Tensor bin_edges,   // [B - 1]
+    at::Tensor bin_indices  // [N]
+)
+{
+    const int N = X.size(0);
+    const int B = bin_edges.size(0);
+
+    const int threads = 256;
+    const int blocks = (N + threads - 1) / threads;
+
+    bin_column_kernel<<<blocks, threads>>>(
+        X.data_ptr<float>(),
+        bin_edges.data_ptr<float>(),
+        bin_indices.data_ptr<int8_t>(),
+        N,
+        B);
+
+    // Optional: sync and error check
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("CUDA error: %s\n", cudaGetErrorString(err));
+}
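The kernel's linear scan advances while `val >= bin_edges[bin]`, so each value receives the count of edges less than or equal to it, i.e. `searchsorted` with `side="right"`. With `int8` output, usable bin counts are capped at 128. Tie-breaking differs from predict-time binning, which uses `torch.bucketize(..., right=False)` and sends a value exactly equal to an edge to the lower bin. A NumPy reference for the kernel's semantics:

```python
import numpy as np

# Reference semantics of bin_column_kernel: the scan advances while
# val >= edge, so the bin equals searchsorted with side="right".
def bin_column_reference(x, edges):
    return np.searchsorted(edges, x, side="right").astype(np.int8)

edges = np.array([0.1, 0.5, 0.9], dtype=np.float32)        # B-1 = 3 edges -> bins 0..3
x = np.array([0.05, 0.1, 0.5, 0.7, 2.0], dtype=np.float32)
print(bin_column_reference(x, edges))                      # [0 1 2 2 3]
```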
{warpgbm-0.1.16 → warpgbm-0.1.17}/warpgbm/cuda/node_kernel.cpp
@@ -40,6 +40,11 @@ void launch_histogram_kernel_cuda_configurable(
     int threads_per_block = 256,
     int rows_per_thread = 1);
 
+void launch_bin_column_kernel(
+    at::Tensor X,
+    at::Tensor bin_edges,
+    at::Tensor bin_indices);
+
 // Bindings
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -47,4 +52,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
     m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
     m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
+    m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
 }
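With this binding registered, the kernel is reachable from Python under the name `core.py` uses in its preprocessing loop. A minimal call sketch; the import path is an assumption based on this package's layout, and a CUDA device is required:

```python
import torch
from warpgbm.cuda import node_kernel  # import path assumed; core.py refers to it as node_kernel

# Dtypes must match the kernel: float32 values and edges in, int8 bins out.
X_f = torch.randn(1_000, device="cuda", dtype=torch.float32)
edges = torch.tensor([-0.5, 0.0, 0.5], device="cuda", dtype=torch.float32)
bins = torch.empty(1_000, device="cuda", dtype=torch.int8)
node_kernel.custom_cuda_binner(X_f, edges, bins)  # fills bins in place
print(bins[:10])
```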
warpgbm-0.1.16/version.txt
DELETED
@@ -1 +0,0 @@
-0.1.16