warpgbm 0.1.15.tar.gz → 0.1.17.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.15
+Version: 0.1.17
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -735,6 +735,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
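Note: the wheel filename in the new Windows instructions still references 0.1.15 even though this release is 0.1.17; the actual name of the wheel produced in `dist\` depends on the package version and the local Python ABI. A version-agnostic variant of the same steps (a sketch, not from the packaged README; it relies only on standard pip flags):

```
git clone https://github.com/jefferythewind/warpgbm.git
cd warpgbm
python setup.py bdist_wheel
python -m pip install --no-index --find-links=dist warpgbm
```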
@@ -47,6 +47,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "warpgbm"
-version = "0.1.15"
+version = "0.1.17"
 description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -22,6 +22,7 @@ def get_extensions():
         sources=[
             "warpgbm/cuda/histogram_kernel.cu",
             "warpgbm/cuda/best_split_kernel.cu",
+            "warpgbm/cuda/binner.cu",
             "warpgbm/cuda/node_kernel.cpp",
         ]
     )
@@ -1,14 +1,12 @@
 import numpy as np
 from warpgbm import WarpGBM
+from sklearn.datasets import make_regression
 
 def test_fit_predict_correlation():
     np.random.seed(42)
-    N = 500
-    F = 5
-    X = np.random.randn(N, F).astype(np.float32)
-    true_weights = np.array([0.5, -1.0, 2.0, 0.0, 1.0])
-    noise = 0.1 * np.random.randn(N)
-    y = (X @ true_weights + noise).astype(np.float32)
+    N = 1_000_000
+    F = 100
+    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
     era = np.zeros(N, dtype=np.int32)
     corrs = []
 
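Note: the rewritten test is now effectively a scale benchmark. A back-of-envelope check of the input size it allocates (a sketch based only on the shapes in the hunk above; `make_regression` returns float64 arrays):

```python
# Rough footprint of the new test inputs before any dtype conversion.
N, F = 1_000_000, 100
x_bytes = N * F * 8       # float64 = 8 bytes per value
print(x_bytes / 1e9)      # 0.8 -> about 0.8 GB for X alone
```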
@@ -0,0 +1 @@
+0.1.17
@@ -3,6 +3,8 @@ import numpy as np
 from sklearn.base import BaseEstimator, RegressorMixin
 from warpgbm.cuda import node_kernel
 from tqdm import tqdm
+from typing import Tuple
+from torch import Tensor
 
 histogram_kernels = {
     'hist1': node_kernel.compute_histogram,
@@ -10,6 +12,36 @@ histogram_kernels = {
     'hist3': node_kernel.compute_histogram3
 }
 
+@torch.jit.script
+def jit_find_best_split(
+    G: Tensor, H: Tensor,
+    lambda_l2: float,
+    lambda_l1: float,  # unused placeholder for now
+    min_split_gain: float,
+    min_child_weight: float
+) -> Tuple[int, int]:
+    F, B = G.size()
+    Bm1 = B - 1
+
+    GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
+    GL, HL = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
+    GP, HP = GH[0, :, -1:], GH[1, :, -1:]  # [F, 1]
+    GR = GP - GL
+    HR = HP - HL
+
+    # Validity mask using raw child hessians
+    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
+    g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
+    gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
+
+    gain_flat = gain.view(-1)
+    best_idx = torch.argmax(gain_flat)
+
+    if gain_flat[best_idx].item() == float('-inf'):
+        return -1, -1
+
+    return best_idx // Bm1, best_idx % Bm1
+
 class WarpGBM(BaseEstimator, RegressorMixin):
     def __init__(
         self,
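Note: to make the new split search easier to follow, here is a self-contained eager-mode sketch of the same gain computation (illustrative only, not part of the package; names mirror the TorchScript function above):

```python
import torch

def find_best_split_reference(G, H, lambda_l2, min_split_gain, min_child_weight):
    # G, H: [F, B] histograms of gradient / hessian sums per feature and bin.
    F, B = G.shape
    GL, HL = G.cumsum(1)[:, :-1], H.cumsum(1)[:, :-1]        # left-child prefix sums, [F, B-1]
    GP, HP = G.sum(1, keepdim=True), H.sum(1, keepdim=True)  # parent totals, [F, 1]
    GR, HR = GP - GL, HP - HL                                # right child = parent minus left
    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
    g = GR**2 / (HR + lambda_l2) + GL**2 / (HL + lambda_l2) - GP**2 / (HP + lambda_l2)
    gain = torch.where(valid & (g >= min_split_gain), g, torch.full_like(g, -1.0))
    best = int(torch.argmax(gain.view(-1)))
    if gain.view(-1)[best] < 0:  # every candidate was masked out
        return -1, -1
    return best // (B - 1), best % (B - 1)

# Toy histograms: 3 features x 8 bins.
f, b = find_best_split_reference(torch.randn(3, 8), torch.rand(3, 8) + 0.1,
                                 lambda_l2=1e-6, min_split_gain=0.0, min_child_weight=1e-3)
```

One detail worth flagging: the TorchScript version fills masked-out candidates with -1.0 but tests the winner against float('-inf'), so the no-valid-split branch will not fire on that sentinel; the sketch above tests against the sentinel it actually writes.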
@@ -24,6 +56,7 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         threads_per_block=64,
         rows_per_thread=4,
         L2_reg = 1e-6,
+        L1_reg = 0.0,
         device = 'cuda'
     ):
         self.num_bins = num_bins
@@ -54,7 +87,7 @@
         self.threads_per_block = threads_per_block
         self.rows_per_thread = rows_per_thread
         self.L2_reg = L2_reg
-
+        self.L1_reg = L1_reg
 
     def fit(self, X, y, era_id=None):
         if era_id is None:
@@ -68,42 +101,44 @@
         self.split_gains = torch.zeros((self.num_features, self.num_bins - 1), device=self.device)
         self.forest = self.grow_forest()
         return self
-
-    def compute_quantile_bins(self, X, num_bins):
-        quantiles = torch.linspace(0, 1, num_bins + 1)[1:-1]  # exclude 0% and 100%
-        bin_edges = torch.quantile(X, quantiles, dim=0)  # shape: [B-1, F]
-        return bin_edges.T  # shape: [F, B-1]
 
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
-        self.num_samples, self.num_features = X_np.shape
-        Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
-        era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
-        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-        if is_integer_type:
-            max_vals = X_np.max(axis=0)
-            if np.all(max_vals < self.num_bins):
-                print("Detected pre-binned integer input — skipping quantile binning.")
-                bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
-
-                # We'll store None or an empty tensor in self.bin_edges
-                # to indicate that we skip binning at predict-time
-                bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
-                bin_edges = bin_edges.to(self.device)
-                unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
-                return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
-            else:
-                print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
-
-        print("Performing quantile binning on CPU...")
-        X_cpu = torch.from_numpy(X_np).type(torch.float32)  # CPU tensor
-        bin_edges_cpu = self.compute_quantile_bins(X_cpu, self.num_bins).type(torch.float32).contiguous()
-        bin_indices_cpu = torch.empty((self.num_samples, self.num_features), dtype=torch.int8)
-        for f in range(self.num_features):
-            bin_indices_cpu[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
-        bin_indices = bin_indices_cpu.to(self.device).contiguous()
-        bin_edges = bin_edges_cpu.to(self.device)
-        unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
-        return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+        with torch.no_grad():
+            self.num_samples, self.num_features = X_np.shape
+            Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
+            era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    print("Detected pre-binned integer input — skipping quantile binning.")
+                    bin_indices = torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+
+                    # We'll store None or an empty tensor in self.bin_edges
+                    # to indicate that we skip binning at predict-time
+                    bin_edges = torch.arange(1, self.num_bins, dtype=torch.float32).repeat(self.num_features, 1)
+                    bin_edges = bin_edges.to(self.device)
+                    unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+                    return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+                else:
+                    print("Integer input detected, but values exceed num_bins — falling back to quantile binning.")
+
+            bin_indices = torch.empty((self.num_samples, self.num_features), dtype=torch.int8, device='cuda')
+            bin_edges = torch.empty((self.num_features, self.num_bins - 1), dtype=torch.float32, device='cuda')
+
+            X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
+
+            for f in range(self.num_features):
+                X_f = X_np[:, f].to('cuda', non_blocking=True)
+                quantiles = torch.linspace(0, 1, self.num_bins + 1, device='cuda', dtype=X_f.dtype)[1:-1]
+                bin_edges_f = torch.quantile(X_f, quantiles, dim=0).contiguous()  # shape: [B-1] for 1D input
+                bin_indices_f = bin_indices[:, f].contiguous()  # view into output
+                node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
+                bin_indices[:,f] = bin_indices_f
+                bin_edges[f,:] = bin_edges_f
+
+            unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
+            return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
 
     def compute_histograms(self, bin_indices_sub, gradients):
         grad_hist = torch.zeros((self.num_features, self.num_bins), device=self.device, dtype=torch.float32)
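Note: the preprocessing rewrite moves binning to the GPU and streams one feature column at a time out of pinned (page-locked) host memory, which is what makes the non_blocking copy meaningful. A minimal sketch of that transfer pattern in isolation (assumes a CUDA device; 16 bins here, hence 15 interior quantile edges):

```python
import torch

if torch.cuda.is_available():
    X_host = torch.randn(100_000, 16).pin_memory()  # page-locked host buffer
    for f in range(X_host.shape[1]):
        # non_blocking=True lets the host-to-device copy overlap with queued GPU work
        X_f = X_host[:, f].to('cuda', non_blocking=True)
        quantiles = torch.linspace(0, 1, 17, device='cuda')[1:-1]
        edges = torch.quantile(X_f, quantiles).contiguous()  # sorted edges, shape [15]
        # ...a binning kernel would consume X_f and edges here...
```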
@@ -121,20 +156,14 @@
         return grad_hist, hess_hist
 
     def find_best_split(self, gradient_histogram, hessian_histogram):
-        node_kernel.compute_split(
-            gradient_histogram.contiguous(),
-            hessian_histogram.contiguous(),
-            self.num_features,
-            self.num_bins,
+        f,b = jit_find_best_split(
+            gradient_histogram,
+            hessian_histogram,
+            self.L2_reg,
+            self.L1_reg,
             self.min_split_gain,
             self.min_child_weight,
-            self.L2_reg,
-            self.out_feature,
-            self.out_bin
         )
-
-        f = int(self.out_feature[0])
-        b = int(self.out_bin[0])
         return (f, b)
 
     def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
@@ -179,24 +208,25 @@
         return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
 
     def grow_forest(self):
-        forest = [{} for _ in range(self.n_estimators)]
-        self.training_loss = []
-
-        for i in range(self.n_estimators):
-            self.residual = self.Y_gpu - self.gradients
-
-            self.root_gradient_histogram, self.root_hessian_histogram = \
-                self.compute_histograms(self.bin_indices, self.residual)
-
-            tree = self.grow_tree(
-                self.root_gradient_histogram,
-                self.root_hessian_histogram,
-                self.root_node_indices,
-                depth=0
-            )
-            forest[i] = tree
-            loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
-            self.training_loss.append(loss)
+        with torch.no_grad():
+            forest = [{} for _ in range(self.n_estimators)]
+            self.training_loss = []
+
+            for i in tqdm( range(self.n_estimators) ):
+                self.residual = self.Y_gpu - self.gradients
+
+                self.root_gradient_histogram, self.root_hessian_histogram = \
+                    self.compute_histograms(self.bin_indices, self.residual)
+
+                tree = self.grow_tree(
+                    self.root_gradient_histogram,
+                    self.root_hessian_histogram,
+                    self.root_node_indices,
+                    depth=0
+                )
+                forest[i] = tree
+                # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
+                # self.training_loss.append(loss)
             # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
 
         print("Finished training forest.")
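Note: nothing in this training loop needs gradients, so the new torch.no_grad() wrapper acts as a guard that no op inside it ever records an autograd graph (and avoids that bookkeeping overhead) regardless of input flags. A two-line illustration of the guarantee (a generic PyTorch sketch, not package code):

```python
import torch

w = torch.randn(3, requires_grad=True)
with torch.no_grad():
    y = w * 2             # no graph is recorded inside the block
print(y.requires_grad)    # False
```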
@@ -208,103 +238,104 @@
         We assume `flatten_forest_to_tensors` has produced self.flat_forest with
         "features", "thresholds", "leaf_values", all shaped [n_trees, max_nodes].
         """
-        # 1) Convert X_np -> bin_indices
-        is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-        if is_integer_type:
-            max_vals = X_np.max(axis=0)
-            if np.all(max_vals < self.num_bins):
-                bin_indices = X_np.astype(np.int8)
+        with torch.no_grad():
+            # 1) Convert X_np -> bin_indices
+            is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+            if is_integer_type:
+                max_vals = X_np.max(axis=0)
+                if np.all(max_vals < self.num_bins):
+                    bin_indices = X_np.astype(np.int8)
+                else:
+                    raise ValueError("Pre-binned integers must be < num_bins")
             else:
-                raise ValueError("Pre-binned integers must be < num_bins")
-        else:
-            X_cpu = torch.from_numpy(X_np).type(torch.float32)
-            bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
-            bin_edges_cpu = self.bin_edges.to('cpu')
-            for f in range(self.num_features):
-                bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
-            bin_indices = bin_indices.numpy()
-
-        # 2) Ensure we have a padded representation
-        self.flat_forest = self.flatten_forest_to_tensors(self.forest)
-
-        features_t = self.flat_forest["features"]  # [n_trees, max_nodes], int16
-        thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
-        values_t = self.flat_forest["leaf_values"]  # [n_trees, max_nodes], float32
-        max_nodes = self.flat_forest["max_nodes"]
-
-        n_trees = features_t.shape[0]
-        N = bin_indices.shape[0]
-        out = np.zeros(N, dtype=np.float32)
-
-        # 3) Process rows in chunks
-        for start in tqdm(range(0, N, chunk_size)):
-            end = min(start + chunk_size, N)
-            chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
-            chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
-
-            # Accumulate raw (unscaled) leaf sums
-            chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
-
-            # node_idx[i] tracks the current node index in the padded tree for row i
-            node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
-
-            # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
-            active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
-
-            for t in range(n_trees):
-                # Reset for each tree (each tree is independent)
-                node_idx.fill_(0)
-                active.fill_(True)
-
-                tree_features = features_t[t]  # shape [max_nodes], int16
-                tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
-                tree_values = values_t[t]  # shape [max_nodes], float32
-
-                # Up to self.max_depth+1 layers
-                for _level in range(self.max_depth + 1):
-                    active_idx = active.nonzero(as_tuple=True)[0]
-                    if active_idx.numel() == 0:
-                        break  # all rows are done in this tree
-
-                    current_node_idx = node_idx[active_idx]
-                    f = tree_features[current_node_idx]  # shape [#active], int16
-                    thr = tree_thresh[current_node_idx]  # shape [#active], int16
-                    vals = tree_values[current_node_idx]  # shape [#active], float32
-
-                    mask_no_node = (f == -2)
-                    mask_leaf = (f == -1)
-
-                    # If leaf, add leaf value and mark inactive.
-                    if mask_leaf.any():
-                        leaf_rows = active_idx[mask_leaf]
-                        chunk_preds[leaf_rows] += vals[mask_leaf]
-                        active[leaf_rows] = False
-
-                    # If no node, mark inactive.
-                    if mask_no_node.any():
-                        no_node_rows = active_idx[mask_no_node]
-                        active[no_node_rows] = False
-
-                    # For internal nodes, perform bin comparison.
-                    mask_internal = (~mask_leaf & ~mask_no_node)
-                    if mask_internal.any():
-                        internal_rows = active_idx[mask_internal]
-                        act_f = f[mask_internal].long()
-                        act_thr = thr[mask_internal]
-                        binvals = chunk_gpu[internal_rows, act_f]
-                        go_left = (binvals <= act_thr)
-                        new_left_idx = current_node_idx[mask_internal] * 2 + 1
-                        new_right_idx = current_node_idx[mask_internal] * 2 + 2
-                        node_idx[internal_rows[go_left]] = new_left_idx[go_left]
-                        node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
-                # end per-tree layer loop
-            # end for each tree
-
-            out[start:end] = (
-                self.base_prediction + self.learning_rate * chunk_preds
-            ).cpu().numpy()
-
-        return out
+                X_cpu = torch.from_numpy(X_np).type(torch.float32)
+                bin_indices = torch.empty((X_np.shape[0], X_np.shape[1]), dtype=torch.int8)
+                bin_edges_cpu = self.bin_edges.to('cpu')
+                for f in range(self.num_features):
+                    bin_indices[:, f] = torch.bucketize(X_cpu[:, f], bin_edges_cpu[f], right=False).type(torch.int8)
+                bin_indices = bin_indices.numpy()
+
+            # 2) Ensure we have a padded representation
+            self.flat_forest = self.flatten_forest_to_tensors(self.forest)
+
+            features_t = self.flat_forest["features"]  # [n_trees, max_nodes], int16
+            thresholds_t = self.flat_forest["thresholds"]  # [n_trees, max_nodes], int16
+            values_t = self.flat_forest["leaf_values"]  # [n_trees, max_nodes], float32
+            max_nodes = self.flat_forest["max_nodes"]
+
+            n_trees = features_t.shape[0]
+            N = bin_indices.shape[0]
+            out = np.zeros(N, dtype=np.float32)
+
+            # 3) Process rows in chunks
+            for start in tqdm(range(0, N, chunk_size)):
+                end = min(start + chunk_size, N)
+                chunk_np = bin_indices[start:end]  # shape [chunk_size, F]
+                chunk_gpu = torch.from_numpy(chunk_np).to(self.device)  # [chunk_size, F], int8
+
+                # Accumulate raw (unscaled) leaf sums
+                chunk_preds = torch.zeros((end - start,), dtype=torch.float32, device=self.device)
+
+                # node_idx[i] tracks the current node index in the padded tree for row i
+                node_idx = torch.zeros((end - start,), dtype=torch.int32, device=self.device)
+
+                # 'active' is a boolean mask over [0..(end-start-1)], indicating which rows haven't reached a leaf
+                active = torch.ones((end - start,), dtype=torch.bool, device=self.device)
+
+                for t in range(n_trees):
+                    # Reset for each tree (each tree is independent)
+                    node_idx.fill_(0)
+                    active.fill_(True)
+
+                    tree_features = features_t[t]  # shape [max_nodes], int16
+                    tree_thresh = thresholds_t[t]  # shape [max_nodes], int16
+                    tree_values = values_t[t]  # shape [max_nodes], float32
+
+                    # Up to self.max_depth+1 layers
+                    for _level in range(self.max_depth + 1):
+                        active_idx = active.nonzero(as_tuple=True)[0]
+                        if active_idx.numel() == 0:
+                            break  # all rows are done in this tree
+
+                        current_node_idx = node_idx[active_idx]
+                        f = tree_features[current_node_idx]  # shape [#active], int16
+                        thr = tree_thresh[current_node_idx]  # shape [#active], int16
+                        vals = tree_values[current_node_idx]  # shape [#active], float32
+
+                        mask_no_node = (f == -2)
+                        mask_leaf = (f == -1)
+
+                        # If leaf, add leaf value and mark inactive.
+                        if mask_leaf.any():
+                            leaf_rows = active_idx[mask_leaf]
+                            chunk_preds[leaf_rows] += vals[mask_leaf]
+                            active[leaf_rows] = False
+
+                        # If no node, mark inactive.
+                        if mask_no_node.any():
+                            no_node_rows = active_idx[mask_no_node]
+                            active[no_node_rows] = False
+
+                        # For internal nodes, perform bin comparison.
+                        mask_internal = (~mask_leaf & ~mask_no_node)
+                        if mask_internal.any():
+                            internal_rows = active_idx[mask_internal]
+                            act_f = f[mask_internal].long()
+                            act_thr = thr[mask_internal]
+                            binvals = chunk_gpu[internal_rows, act_f]
+                            go_left = (binvals <= act_thr)
+                            new_left_idx = current_node_idx[mask_internal] * 2 + 1
+                            new_right_idx = current_node_idx[mask_internal] * 2 + 2
+                            node_idx[internal_rows[go_left]] = new_left_idx[go_left]
+                            node_idx[internal_rows[~go_left]] = new_right_idx[~go_left]
+                    # end per-tree layer loop
+                # end for each tree
+
+                out[start:end] = (
+                    self.base_prediction + self.learning_rate * chunk_preds
+                ).cpu().numpy()
+
+            return out
 
     def flatten_forest_to_tensors(self, forest):
         """
@@ -0,0 +1,52 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void bin_column_kernel(
+    const float *__restrict__ X,          // [N]
+    const float *__restrict__ bin_edges,  // [B - 1]
+    int8_t *__restrict__ bin_indices,     // [N]
+    int N,
+    int B_minus1)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
+        return;
+
+    float val = X[idx];
+    int bin = 0;
+
+    // Linear scan over edges: bin_edges is sorted
+    while (bin < B_minus1 && val >= bin_edges[bin])
+    {
+        ++bin;
+    }
+
+    bin_indices[idx] = static_cast<int8_t>(bin);
+}
+
+// C++ launcher for calling from Python
+void launch_bin_column_kernel(
+    at::Tensor X,           // [N]
+    at::Tensor bin_edges,   // [B - 1]
+    at::Tensor bin_indices  // [N]
+)
+{
+    const int N = X.size(0);
+    const int B = bin_edges.size(0);
+
+    const int threads = 256;
+    const int blocks = (N + threads - 1) / threads;
+
+    bin_column_kernel<<<blocks, threads>>>(
+        X.data_ptr<float>(),
+        bin_edges.data_ptr<float>(),
+        bin_indices.data_ptr<int8_t>(),
+        N,
+        B);
+
+    // Optional: sync and error check
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+        printf("CUDA error: %s\n", cudaGetErrorString(err));
+}
@@ -40,6 +40,11 @@ void launch_histogram_kernel_cuda_configurable(
     int threads_per_block = 256,
     int rows_per_thread = 1);
 
+void launch_bin_column_kernel(
+    at::Tensor X,
+    at::Tensor bin_edges,
+    at::Tensor bin_indices);
+
 // Bindings
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
@@ -47,4 +52,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
     m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
    m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
+    m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
 }
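Note: with the binding above, the new kernel is reachable from Python as node_kernel.custom_cuda_binner, matching the call site in preprocess_gpu_data. A minimal sketch of a direct invocation (assumes the extension is built and a CUDA device is available; 16 bins implies 15 edges):

```python
import torch
from warpgbm.cuda import node_kernel  # compiled extension from this package

x = torch.randn(1_000_000, device='cuda')           # one float32 feature column
q = torch.linspace(0, 1, 17, device='cuda')[1:-1]   # 15 interior quantiles
edges = torch.quantile(x, q).contiguous()           # sorted bin edges, [B-1]
out = torch.empty_like(x, dtype=torch.int8)         # one int8 bin index per row
node_kernel.custom_cuda_binner(x, edges, out)       # fills `out` in place
```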
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.15
+Version: 0.1.17
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
         Version 3, 29 June 2007
@@ -735,6 +735,17 @@ This installs from PyPI and also compiles CUDA code locally during installation.
 > pip install warpgbm --no-build-isolation
 > ```
 
+### Windows
+
+Thank you, ShatteredX, for providing working instructions for a Windows installation.
+
+```
+git clone https://github.com/jefferythewind/warpgbm.git
+cd warpgbm
+python setup.py bdist_wheel
+pip install .\dist\warpgbm-0.1.15-cp310-cp310-win_amd64.whl
+```
+
 Before either method, make sure you’ve installed PyTorch with GPU support:\
 [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 
@@ -15,5 +15,6 @@ warpgbm.egg-info/requires.txt
 warpgbm.egg-info/top_level.txt
 warpgbm/cuda/__init__.py
 warpgbm/cuda/best_split_kernel.cu
+warpgbm/cuda/binner.cu
 warpgbm/cuda/histogram_kernel.cu
 warpgbm/cuda/node_kernel.cpp
@@ -1 +0,0 @@
-0.1.15
5 files without changes