warpgbm 0.1.17__tar.gz → 0.1.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {warpgbm-0.1.17/warpgbm.egg-info → warpgbm-0.1.19}/PKG-INFO +19 -2
- {warpgbm-0.1.17 → warpgbm-0.1.19}/README.md +18 -1
- {warpgbm-0.1.17 → warpgbm-0.1.19}/pyproject.toml +1 -1
- warpgbm-0.1.19/tests/test_fit_predict_corr.py +44 -0
- warpgbm-0.1.19/version.txt +1 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/core.py +94 -64
- warpgbm-0.1.19/warpgbm/cuda/best_split_kernel.cu +79 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/cuda/node_kernel.cpp +5 -6
- {warpgbm-0.1.17 → warpgbm-0.1.19/warpgbm.egg-info}/PKG-INFO +19 -2
- warpgbm-0.1.17/tests/test_fit_predict_corr.py +0 -66
- warpgbm-0.1.17/version.txt +0 -1
- warpgbm-0.1.17/warpgbm/cuda/best_split_kernel.cu +0 -112
- {warpgbm-0.1.17 → warpgbm-0.1.19}/LICENSE +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/MANIFEST.in +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/setup.cfg +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/setup.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/tests/__init__.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/__init__.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/cuda/__init__.py +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/cuda/binner.cu +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/cuda/histogram_kernel.cu +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm.egg-info/SOURCES.txt +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm.egg-info/dependency_links.txt +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm.egg-info/requires.txt +0 -0
- {warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm.egg-info/top_level.txt +0 -0
{warpgbm-0.1.17/warpgbm.egg-info → warpgbm-0.1.19}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.17
+Version: 0.1.19
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007
@@ -706,7 +706,24 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 
 ## Performance Note
 
-In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** than LightGBM's CPU version and **2x faster** than its GPU version, all with default configurations. It is also faster than XGBoost and CatBoost on regression problems, and it consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+
+---
+
+## Benchmarks
+
+### Scikit-Learn Synthetic Data: 1 Million Rows and 1,000 Features
+
+In this benchmark we compare the speed and in-sample correlation of **WarpGBM v0.1.19** against the GPU-enabled versions of LightGBM, XGBoost, and CatBoost. The benchmark runs on Google Colab in the L4 GPU environment; the CPU versions don't come close on speed here, so we didn't test them.
+
+```
+WarpGBM:  corr = 0.8882, train = 21.8s,  infer = 11.6s
+XGBoost:  corr = 0.8877, train = 33.4s,  infer = 8.1s
+LightGBM: corr = 0.8604, train = 30.2s,  infer = 1.4s
+CatBoost: corr = 0.8935, train = 377.9s, infer = 375.8s
+```
+
+Colab Notebook: https://colab.research.google.com/drive/16U1kbYlD5HibGbnF5NGsjChZ1p1IA2pK
 
 ---
 
{warpgbm-0.1.17 → warpgbm-0.1.19}/README.md
@@ -18,7 +18,24 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 
 ## Performance Note
 
-In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** than LightGBM's CPU version and **2x faster** than its GPU version, all with default configurations. It is also faster than XGBoost and CatBoost on regression problems, and it consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+
+---
+
+## Benchmarks
+
+### Scikit-Learn Synthetic Data: 1 Million Rows and 1,000 Features
+
+In this benchmark we compare the speed and in-sample correlation of **WarpGBM v0.1.19** against the GPU-enabled versions of LightGBM, XGBoost, and CatBoost. The benchmark runs on Google Colab in the L4 GPU environment; the CPU versions don't come close on speed here, so we didn't test them.
+
+```
+WarpGBM:  corr = 0.8882, train = 21.8s,  infer = 11.6s
+XGBoost:  corr = 0.8877, train = 33.4s,  infer = 8.1s
+LightGBM: corr = 0.8604, train = 30.2s,  infer = 1.4s
+CatBoost: corr = 0.8935, train = 377.9s, infer = 375.8s
+```
+
+Colab Notebook: https://colab.research.google.com/drive/16U1kbYlD5HibGbnF5NGsjChZ1p1IA2pK
 
 ---
 
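The benchmark numbers above come from the linked notebook. For orientation, here is a hedged sketch of how such a run could be set up; the dataset shape matches the README's description, but the WarpGBM hyperparameters below are placeholder assumptions, not the notebook's actual settings.

```python
# Hypothetical reproduction sketch for the benchmark above (not the notebook's code).
# The dataset shape (1M rows x 1,000 features) is from the README; model settings are assumed.
import time

import numpy as np
from sklearn.datasets import make_regression

from warpgbm import WarpGBM

X, y = make_regression(n_samples=1_000_000, n_features=1000, noise=0.1, random_state=42)

model = WarpGBM(num_bins=10, max_depth=10, n_estimators=10)  # assumed hyperparameters

t0 = time.time()
model.fit(X, y)
print(f"train = {time.time() - t0:.1f}s")

t0 = time.time()
preds = model.predict(X)
print(f"infer = {time.time() - t0:.1f}s")

print(f"corr = {np.corrcoef(preds, y)[0, 1]:.4f}")
```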
warpgbm-0.1.19/tests/test_fit_predict_corr.py ADDED
@@ -0,0 +1,44 @@
+import time
+
+import numpy as np
+from sklearn.datasets import make_regression
+
+from warpgbm import WarpGBM
+
+def test_fit_predict_correlation():
+    np.random.seed(42)
+    N = 100_000
+    F = 1000
+    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
+    era = np.zeros(N, dtype=np.int32)
+    corrs = []
+
+    for hist_type in ['hist1', 'hist2', 'hist3']:
+        print(f"\nTesting histogram method: {hist_type}")
+
+        model = WarpGBM(
+            max_depth=10,
+            num_bins=10,
+            n_estimators=10,
+            learning_rate=1,
+            verbosity=False,
+            histogram_computer=hist_type,
+            threads_per_block=64,
+            rows_per_thread=4
+        )
+
+        start_fit = time.time()
+        model.fit(X, y, era_id=era)
+        fit_time = time.time() - start_fit
+        print(f"  Fit time: {fit_time:.3f} seconds")
+
+        start_pred = time.time()
+        preds = model.predict(X)
+        pred_time = time.time() - start_pred
+        print(f"  Predict time: {pred_time:.3f} seconds")
+
+        corr = np.corrcoef(preds, y)[0, 1]
+        print(f"  Correlation: {corr:.4f}")
+        corrs.append(corr)
+
+    assert (np.array(corrs) > 0.95).all(), f"In-sample correlation too low: {corrs}"
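To run just this test locally (a CUDA-capable GPU and a built install of the package are assumed), `pytest tests/test_fit_predict_corr.py -v -s` should do it; the `-s` flag lets the timing printouts through.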
warpgbm-0.1.19/version.txt ADDED
@@ -0,0 +1 @@
+0.1.19
{warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/core.py
@@ -12,36 +12,6 @@ histogram_kernels = {
     'hist3': node_kernel.compute_histogram3
 }
 
-@torch.jit.script
-def jit_find_best_split(
-    G: Tensor, H: Tensor,
-    lambda_l2: float,
-    lambda_l1: float,  # unused placeholder for now
-    min_split_gain: float,
-    min_child_weight: float
-) -> Tuple[int, int]:
-    F, B = G.size()
-    Bm1 = B - 1
-
-    GH = torch.stack([G, H], dim=0).cumsum(dim=2)  # [2, F, B]
-    GL, HL = GH[0, :, :-1], GH[1, :, :-1]  # [F, B-1]
-    GP, HP = GH[0, :, -1:], GH[1, :, -1:]  # [F, 1]
-    GR = GP - GL
-    HR = HP - HL
-
-    # Validity mask using raw child hessians
-    valid = (HL >= min_child_weight) & (HR >= min_child_weight)
-    g = (GR**2)/(HR + lambda_l2) + (GL**2)/(HL + lambda_l2) - (GP**2)/(HP + lambda_l2)
-    gain = torch.where(valid & (g >= min_split_gain), g, -1.0)
-
-    gain_flat = gain.view(-1)
-    best_idx = torch.argmax(gain_flat)
-
-    if gain_flat[best_idx].item() == float('-inf'):
-        return -1, -1
-
-    return best_idx // Bm1, best_idx % Bm1
-
 class WarpGBM(BaseEstimator, RegressorMixin):
     def __init__(
         self,
@@ -55,16 +25,31 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         histogram_computer='hist3',
         threads_per_block=64,
         rows_per_thread=4,
-        L2_reg
-        L1_reg
-        device
+        L2_reg=1e-6,
+        L1_reg=0.0,
+        device='cuda'
     ):
+        # Validate arguments
+        self._validate_hyperparams(
+            num_bins=num_bins,
+            max_depth=max_depth,
+            learning_rate=learning_rate,
+            n_estimators=n_estimators,
+            min_child_weight=min_child_weight,
+            min_split_gain=min_split_gain,
+            histogram_computer=histogram_computer,
+            threads_per_block=threads_per_block,
+            rows_per_thread=rows_per_thread,
+            L2_reg=L2_reg,
+            L1_reg=L1_reg
+        )
+
         self.num_bins = num_bins
         self.max_depth = max_depth
         self.learning_rate = learning_rate
         self.n_estimators = n_estimators
         self.forest = None
-        self.bin_edges = None
+        self.bin_edges = None
         self.base_prediction = None
         self.unique_eras = None
         self.device = device
@@ -76,12 +61,8 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.Y_gpu = None
         self.num_features = None
         self.num_samples = None
-        self.out_feature = torch.zeros(1, device=self.device, dtype=torch.int32)
-        self.out_bin = torch.zeros(1, device=self.device, dtype=torch.int32)
         self.min_child_weight = min_child_weight
         self.min_split_gain = min_split_gain
-        self.best_gain = torch.tensor([-float('inf')], dtype=torch.float32, device=self.device)
-        self.best_feature = torch.tensor([-1], dtype=torch.int32, device=self.device)
         self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
         self.compute_histogram = histogram_kernels[histogram_computer]
         self.threads_per_block = threads_per_block
@@ -89,6 +70,45 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.L2_reg = L2_reg
         self.L1_reg = L1_reg
 
+    def _validate_hyperparams(self, **kwargs):
+        # Type checks
+        int_params = [
+            "num_bins", "max_depth", "n_estimators", "min_child_weight",
+            "threads_per_block", "rows_per_thread"
+        ]
+        float_params = [
+            "learning_rate", "min_split_gain", "L2_reg", "L1_reg"
+        ]
+
+        for param in int_params:
+            if not isinstance(kwargs[param], int):
+                raise TypeError(f"{param} must be an integer, got {type(kwargs[param])}.")
+
+        for param in float_params:
+            if not isinstance(kwargs[param], (float, int)):  # Accept ints as valid floats
+                raise TypeError(f"{param} must be a float, got {type(kwargs[param])}.")
+
+        if not (2 <= kwargs["num_bins"] <= 127):
+            raise ValueError("num_bins must be between 2 and 127 inclusive.")
+        if kwargs["max_depth"] < 1:
+            raise ValueError("max_depth must be at least 1.")
+        if not (0.0 < kwargs["learning_rate"] <= 1.0):
+            raise ValueError("learning_rate must be in (0.0, 1.0].")
+        if kwargs["n_estimators"] <= 0:
+            raise ValueError("n_estimators must be positive.")
+        if kwargs["min_child_weight"] < 1:
+            raise ValueError("min_child_weight must be a positive integer.")
+        if kwargs["min_split_gain"] < 0:
+            raise ValueError("min_split_gain must be non-negative.")
+        if kwargs["threads_per_block"] <= 0 or kwargs["threads_per_block"] % 32 != 0:
+            raise ValueError("threads_per_block should be a positive multiple of 32 (warp size).")
+        if not (1 <= kwargs["rows_per_thread"] <= 16):
+            raise ValueError("rows_per_thread must be between 1 and 16 inclusive.")
+        if kwargs["L2_reg"] < 0 or kwargs["L1_reg"] < 0:
+            raise ValueError("L2_reg and L1_reg must be non-negative.")
+        if kwargs["histogram_computer"] not in histogram_kernels:
+            raise ValueError(f"Invalid histogram_computer: {kwargs['histogram_computer']}. Choose from {list(histogram_kernels.keys())}.")
+
     def fit(self, X, y, era_id=None):
         if era_id is None:
             era_id = np.ones(X.shape[0], dtype='int32')
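A quick note on `_validate_hyperparams` above: since it runs at the top of `__init__`, bad settings now fail at construction time instead of surfacing mid-fit. A small hypothetical usage sketch (not code from the package):

```python
# Hypothetical sketch: invalid hyperparameters are rejected at construction.
from warpgbm import WarpGBM

try:
    WarpGBM(num_bins=200)  # outside the allowed 2..127 range
except ValueError as err:
    print(err)  # num_bins must be between 2 and 127 inclusive.
```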
@@ -98,8 +118,10 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         self.root_node_indices = torch.arange(self.num_samples, device=self.device)
         self.base_prediction = self.Y_gpu.mean().item()
         self.gradients += self.base_prediction
-        self.
-        self.
+        self.best_gains = torch.zeros(self.num_features, device=self.device)
+        self.best_bins = torch.zeros(self.num_features, device=self.device, dtype=torch.int32)
+        with torch.no_grad():
+            self.forest = self.grow_forest()
         return self
 
     def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
@@ -156,15 +178,24 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         return grad_hist, hess_hist
 
     def find_best_split(self, gradient_histogram, hessian_histogram):
-
+        node_kernel.compute_split(
             gradient_histogram,
             hessian_histogram,
-            self.L2_reg,
-            self.L1_reg,
             self.min_split_gain,
             self.min_child_weight,
+            self.L2_reg,
+            self.best_gains,
+            self.best_bins,
+            self.threads_per_block
         )
-
+
+        if torch.all(self.best_bins == -1):
+            return -1, -1  # No valid split found
+
+        f = torch.argmax(self.best_gains).item()
+        b = self.best_bins[f].item()
+
+        return f, b
 
     def grow_tree(self, gradient_histogram, hessian_histogram, node_indices, depth):
         if depth == self.max_depth:
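To make the new split-search semantics concrete, here is a hedged NumPy reference of what the CUDA kernel added later in this diff (best_split_kernel.cu) computes per feature, together with the cross-feature argmax that `find_best_split` performs on the host. It is a readability sketch of the same gain formula, not the package's code:

```python
# NumPy reference sketch of the split search (illustrative, not the package's code).
# G and H are [F x B] per-feature gradient and hessian histograms.
import numpy as np

def find_best_split_reference(G, H, min_split_gain, min_child_weight, l2_reg):
    F, B = G.shape
    GL = np.cumsum(G, axis=1)[:, :-1]           # left-child gradient sum at each boundary
    HL = np.cumsum(H, axis=1)[:, :-1]           # left-child hessian sum
    GR = G.sum(axis=1, keepdims=True) - GL      # right-child complements
    HR = H.sum(axis=1, keepdims=True) - HL

    gain = GL**2 / (HL + l2_reg) + GR**2 / (HR + l2_reg)
    valid = (HL >= min_child_weight) & (HR >= min_child_weight) & (gain > min_split_gain)
    gain = np.where(valid, gain, -np.inf)

    best_bins = np.argmax(gain, axis=1)         # per-feature best boundary
    best_gains = gain[np.arange(F), best_bins]
    if not np.isfinite(best_gains).any():
        return -1, -1                           # no valid split anywhere
    f = int(np.argmax(best_gains))
    return f, int(best_bins[f])
```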
@@ -208,27 +239,26 @@ class WarpGBM(BaseEstimator, RegressorMixin):
         return { "feature": best_feature, "bin": best_bin, "left": left_child, "right": right_child }
 
     def grow_forest(self):
-
-
-
-
-
-        self.residual = self.Y_gpu - self.gradients
-
-        self.root_gradient_histogram, self.root_hessian_histogram = \
-            self.compute_histograms(self.bin_indices, self.residual)
-
-        tree = self.grow_tree(
-            self.root_gradient_histogram,
-            self.root_hessian_histogram,
-            self.root_node_indices,
-            depth=0
-        )
-        forest[i] = tree
-        # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
-        # self.training_loss.append(loss)
-        # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
+        forest = [{} for _ in range(self.n_estimators)]
+        self.training_loss = []
+
+        for i in tqdm(range(self.n_estimators)):
+            self.residual = self.Y_gpu - self.gradients
 
+            self.root_gradient_histogram, self.root_hessian_histogram = \
+                self.compute_histograms(self.bin_indices, self.residual)
+
+            tree = self.grow_tree(
+                self.root_gradient_histogram,
+                self.root_hessian_histogram,
+                self.root_node_indices,
+                depth=0
+            )
+            forest[i] = tree
+            # loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
+            # self.training_loss.append(loss)
+            # print(f"🌲 Tree {i+1}/{self.n_estimators} - MSE: {loss:.6f}")
+
         print("Finished training forest.")
         return forest
 
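One detail worth spelling out in `grow_forest`: `self.gradients` accumulates the model's running predictions (seeded with the base prediction in `fit`), so each round fits a tree to the residual `Y - prediction`, which is the standard gradient-boosting update for squared-error loss. A framework-free sketch of that recurrence, using hypothetical `fit_tree` and `predict_tree` helpers in place of WarpGBM's histogram-based trees:

```python
# Generic sketch of the squared-error boosting recurrence (illustrative only).
# fit_tree / predict_tree are hypothetical stand-ins for WarpGBM's tree code.
import numpy as np

def boost(X, y, n_estimators, learning_rate, fit_tree, predict_tree):
    prediction = np.full(len(y), y.mean())    # base prediction, as in fit()
    forest = []
    for _ in range(n_estimators):
        residual = y - prediction             # negative gradient of 0.5 * (y - p)^2
        tree = fit_tree(X, residual)          # grow the next tree on the residual
        forest.append(tree)
        prediction = prediction + learning_rate * predict_tree(tree, X)
    return forest, prediction
```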
warpgbm-0.1.19/warpgbm/cuda/best_split_kernel.cu ADDED
@@ -0,0 +1,79 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__global__ void best_split_kernel_global_only(
+    const float *__restrict__ G, // [F x B]
+    const float *__restrict__ H, // [F x B]
+    int F,
+    int B,
+    float min_split_gain,
+    float min_child_samples,
+    float eps,
+    float *__restrict__ best_gains, // [F]
+    int *__restrict__ best_bins     // [F]
+)
+{
+    int f = blockIdx.x * blockDim.x + threadIdx.x;
+    if (f >= F)
+        return;
+
+    float G_total = 0.0f, H_total = 0.0f;
+    for (int b = 0; b < B; ++b)
+    {
+        G_total += G[f * B + b];
+        H_total += H[f * B + b];
+    }
+
+    float G_L = 0.0f, H_L = 0.0f;
+    float best_gain = min_split_gain;
+    int best_bin = -1;
+
+    for (int b = 0; b < B - 1; ++b)
+    {
+        G_L += G[f * B + b];
+        H_L += H[f * B + b];
+        float G_R = G_total - G_L;
+        float H_R = H_total - H_L;
+
+        if (H_L >= min_child_samples && H_R >= min_child_samples)
+        {
+            float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
+            if (gain > best_gain)
+            {
+                best_gain = gain;
+                best_bin = b;
+            }
+        }
+    }
+
+    best_gains[f] = best_gain;
+    best_bins[f] = best_bin;
+}
+
+void launch_best_split_kernel_cuda(
+    const at::Tensor &G, // [F x B]
+    const at::Tensor &H, // [F x B]
+    float min_split_gain,
+    float min_child_samples,
+    float eps,
+    at::Tensor &best_gains, // [F], float32
+    at::Tensor &best_bins,  // [F], int32
+    int threads)
+{
+    int F = G.size(0);
+    int B = G.size(1);
+
+    int blocks = (F + threads - 1) / threads;
+
+    best_split_kernel_global_only<<<blocks, threads>>>(
+        G.data_ptr<float>(),
+        H.data_ptr<float>(),
+        F,
+        B,
+        min_split_gain,
+        min_child_samples,
+        eps,
+        best_gains.data_ptr<float>(),
+        best_bins.data_ptr<int>());
+}
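Compared with the deleted kernel further down in this diff, this version drops the shared-memory block reduction entirely: each thread scans one feature and writes that feature's best gain and bin to global arrays, and the cross-feature reduction happens on the host via the argmax in `find_best_split`. A plausible motivation, judging from the old code, is that the previous kernel had every block's thread 0 write to a single `out_feature`/`out_bin` pair, which could race whenever more than one block was launched.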
{warpgbm-0.1.17 → warpgbm-0.1.19}/warpgbm/cuda/node_kernel.cpp
@@ -21,15 +21,14 @@ void launch_histogram_kernel_cuda_2(
     int rows_per_thread = 1);
 
 void launch_best_split_kernel_cuda(
-    const at::Tensor &G,
-    const at::Tensor &H,
-    int F,
-    int B,
+    const at::Tensor &G, // [F x B]
+    const at::Tensor &H, // [F x B]
     float min_split_gain,
     float min_child_samples,
     float eps,
-    at::Tensor &out_feature,
-    at::Tensor &out_bin);
+    at::Tensor &best_gains, // [F], float32
+    at::Tensor &best_bins,
+    int threads);
 
 void launch_histogram_kernel_cuda_configurable(
     const at::Tensor &bin_indices,
{warpgbm-0.1.17 → warpgbm-0.1.19/warpgbm.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: warpgbm
-Version: 0.1.17
+Version: 0.1.19
 Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
 License: GNU GENERAL PUBLIC LICENSE
                        Version 3, 29 June 2007
@@ -706,7 +706,24 @@ WarpGBM is a high-performance, GPU-accelerated Gradient Boosted Decision Tree (G
 
 ## Performance Note
 
-In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** compared to LightGBM using default configurations. It also consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+In our initial tests on an NVIDIA 3090 (local) and A100 (Google Colab Pro), WarpGBM achieves **14x to 20x faster training times** than LightGBM's CPU version and **2x faster** than its GPU version, all with default configurations. It is also faster than XGBoost and CatBoost on regression problems, and it consumes **significantly less RAM and CPU**. These early results hint at more thorough benchmarking to come.
+
+---
+
+## Benchmarks
+
+### Scikit-Learn Synthetic Data: 1 Million Rows and 1,000 Features
+
+In this benchmark we compare the speed and in-sample correlation of **WarpGBM v0.1.19** against the GPU-enabled versions of LightGBM, XGBoost, and CatBoost. The benchmark runs on Google Colab in the L4 GPU environment; the CPU versions don't come close on speed here, so we didn't test them.
+
+```
+WarpGBM:  corr = 0.8882, train = 21.8s,  infer = 11.6s
+XGBoost:  corr = 0.8877, train = 33.4s,  infer = 8.1s
+LightGBM: corr = 0.8604, train = 30.2s,  infer = 1.4s
+CatBoost: corr = 0.8935, train = 377.9s, infer = 375.8s
+```
+
+Colab Notebook: https://colab.research.google.com/drive/16U1kbYlD5HibGbnF5NGsjChZ1p1IA2pK
 
 ---
 
warpgbm-0.1.17/tests/test_fit_predict_corr.py DELETED
@@ -1,66 +0,0 @@
-import numpy as np
-from warpgbm import WarpGBM
-from sklearn.datasets import make_regression
-
-def test_fit_predict_correlation():
-    np.random.seed(42)
-    N = 1_000_000
-    F = 100
-    X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
-    era = np.zeros(N, dtype=np.int32)
-    corrs = []
-
-    model = WarpGBM(
-        max_depth = 10,
-        num_bins = 10,
-        n_estimators = 10,
-        learning_rate = 1,
-        verbosity=False,
-        histogram_computer='hist1',
-        threads_per_block=32,
-        rows_per_thread=4
-    )
-
-    model.fit(X, y, era_id=era)
-    preds = model.predict(X)
-
-    # Pearson correlation in-sample
-    corr = np.corrcoef(preds, y)[0, 1]
-    corrs.append(corr)
-
-    model = WarpGBM(
-        max_depth = 10,
-        num_bins = 10,
-        n_estimators = 10,
-        learning_rate = 1,
-        verbosity=False,
-        histogram_computer='hist2',
-        threads_per_block=32,
-        rows_per_thread=4
-    )
-
-    model.fit(X, y, era_id=era)
-    preds = model.predict(X)
-
-    # Pearson correlation in-sample
-    corr = np.corrcoef(preds, y)[0, 1]
-    corrs.append(corr)
-
-    model = WarpGBM(
-        max_depth = 10,
-        num_bins = 10,
-        n_estimators = 10,
-        learning_rate = 1,
-        verbosity=False,
-        histogram_computer='hist3',
-        threads_per_block=32,
-        rows_per_thread=4
-    )
-
-    model.fit(X, y, era_id=era)
-    preds = model.predict(X)
-
-    # Pearson correlation in-sample
-    corr = np.corrcoef(preds, y)[0, 1]
-    corrs.append(corr)
-    assert ( np.array(corrs) > 0.95 ).all(), f"In-sample correlation too low: {corr:.4f}"
warpgbm-0.1.17/version.txt DELETED
@@ -1 +0,0 @@
-0.1.17
warpgbm-0.1.17/warpgbm/cuda/best_split_kernel.cu DELETED
@@ -1,112 +0,0 @@
-#include <torch/extension.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-__global__ void best_split_kernel(
-    const float *__restrict__ G, // [F x B]
-    const float *__restrict__ H, // [F x B]
-    int F,
-    int B,
-    float min_split_gain,
-    float min_child_samples,
-    float eps,
-    int *out_feature,
-    int *out_bin,
-    void *shared_mem)
-{
-    int f = blockIdx.x * blockDim.x + threadIdx.x;
-    if (f >= F)
-        return;
-
-    // Cast shared memory
-    extern __shared__ char smem[];
-    float *gains = reinterpret_cast<float *>(smem);
-    int *features = reinterpret_cast<int *>(&gains[blockDim.x]);
-    int *bins = reinterpret_cast<int *>(&features[blockDim.x]);
-
-    // Calculate total G and H for this feature
-    float G_total = 0.0f, H_total = 0.0f;
-    for (int b = 0; b < B; ++b)
-    {
-        G_total += G[f * B + b];
-        H_total += H[f * B + b];
-    }
-
-    float G_L = 0.0f, H_L = 0.0f;
-    float best_gain = min_split_gain;
-    int best_bin = -1;
-
-    for (int b = 0; b < B - 1; ++b)
-    {
-        G_L += G[f * B + b];
-        H_L += H[f * B + b];
-        float G_R = G_total - G_L;
-        float H_R = H_total - H_L;
-
-        if (H_L > min_child_samples && H_R > min_child_samples)
-        {
-            float gain = (G_L * G_L) / (H_L + eps) + (G_R * G_R) / (H_R + eps);
-            if (gain > best_gain)
-            {
-                best_gain = gain;
-                best_bin = b;
-            }
-        }
-    }
-
-    gains[threadIdx.x] = best_gain;
-    features[threadIdx.x] = f;
-    bins[threadIdx.x] = best_bin;
-    __syncthreads();
-
-    // Thread 0 in each block finds best among its block
-    if (threadIdx.x == 0)
-    {
-        float block_best_gain = min_split_gain;
-        int block_best_feature = -1;
-        int block_best_bin = -1;
-        for (int i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < F; ++i)
-        {
-            if (gains[i] > block_best_gain)
-            {
-                block_best_gain = gains[i];
-                block_best_feature = features[i];
-                block_best_bin = bins[i];
-            }
-        }
-
-        // Write to global outputs
-        *out_feature = block_best_feature;
-        *out_bin = block_best_bin;
-    }
-}
-
-void launch_best_split_kernel_cuda(
-    const at::Tensor &G,
-    const at::Tensor &H,
-    int F,
-    int B,
-    float min_split_gain,
-    float min_child_samples,
-    float eps,
-    at::Tensor &out_feature,
-    at::Tensor &out_bin)
-{
-    int threads = 256;
-    int blocks = (F + threads - 1) / threads;
-
-    size_t shared_mem_bytes = threads * (sizeof(float) + 2 * sizeof(int));
-
-    best_split_kernel<<<blocks, threads, shared_mem_bytes>>>(
-        G.data_ptr<float>(),
-        H.data_ptr<float>(),
-        F,
-        B,
-        min_split_gain,
-        min_child_samples,
-        eps,
-        out_feature.data_ptr<int>(),
-        out_bin.data_ptr<int>(),
-        nullptr // shared memory pointer not needed; just launch size
-    );
-}