warpgbm 0.1.25__tar.gz → 0.1.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29):
  1. {warpgbm-0.1.25/warpgbm.egg-info → warpgbm-0.1.27}/PKG-INFO +6 -1
  2. {warpgbm-0.1.25 → warpgbm-0.1.27}/README.md +6 -1
  3. {warpgbm-0.1.25 → warpgbm-0.1.27}/pyproject.toml +2 -2
  4. warpgbm-0.1.27/tests/full_numerai_test.py +67 -0
  5. warpgbm-0.1.27/tests/test_fit_predict_corr.py +52 -0
  6. warpgbm-0.1.27/version.txt +1 -0
  7. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/core.py +106 -91
  8. warpgbm-0.1.27/warpgbm/cuda/histogram_kernel.cu +95 -0
  9. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/node_kernel.cpp +3 -20
  10. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/predict.cu +20 -21
  11. warpgbm-0.1.27/warpgbm/metrics.py +10 -0
  12. {warpgbm-0.1.25 → warpgbm-0.1.27/warpgbm.egg-info}/PKG-INFO +6 -1
  13. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm.egg-info/SOURCES.txt +2 -0
  14. warpgbm-0.1.25/tests/test_fit_predict_corr.py +0 -57
  15. warpgbm-0.1.25/version.txt +0 -1
  16. warpgbm-0.1.25/warpgbm/cuda/histogram_kernel.cu +0 -250
  17. {warpgbm-0.1.25 → warpgbm-0.1.27}/LICENSE +0 -0
  18. {warpgbm-0.1.25 → warpgbm-0.1.27}/MANIFEST.in +0 -0
  19. {warpgbm-0.1.25 → warpgbm-0.1.27}/setup.cfg +0 -0
  20. {warpgbm-0.1.25 → warpgbm-0.1.27}/setup.py +0 -0
  21. {warpgbm-0.1.25 → warpgbm-0.1.27}/tests/__init__.py +0 -0
  22. {warpgbm-0.1.25 → warpgbm-0.1.27}/tests/numerai_test.py +0 -0
  23. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/__init__.py +0 -0
  24. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/__init__.py +0 -0
  25. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/best_split_kernel.cu +0 -0
  26. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/binner.cu +0 -0
  27. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm.egg-info/dependency_links.txt +0 -0
  28. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm.egg-info/requires.txt +0 -0
  29. {warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm.egg-info/top_level.txt +0 -0
{warpgbm-0.1.25/warpgbm.egg-info → warpgbm-0.1.27}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warpgbm
- Version: 0.1.25
+ Version: 0.1.27
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -889,6 +889,7 @@ No installation required — just press **"Open in Playground"**, then **Run All
  y_eval=None, # numpy array (float or int) 1 dimension (eval_num_samples)
  eval_every_n_trees=None, # const (int) >= 1
  early_stopping_rounds=None, # const (int) >= 1
+ eval_metric='mse' # string, one of 'mse' or 'corr'. For corr, loss is 1 - correlation(y_true, preds)
  )
  ```
  Train with optional validation set and early stopping.
@@ -922,3 +923,7 @@ WarpGBM builds on the shoulders of PyTorch, scikit-learn, LightGBM, and the CUDA
  ### v0.1.25

  - Added `colsample_bytree` parameter and new test using Numerai data.
+
+ ### v0.1.26
+
+ - Fix Memory bugs in prediction and colsample bytree logic. Added "corr" eval metric.
{warpgbm-0.1.25 → warpgbm-0.1.27}/README.md
@@ -201,6 +201,7 @@ No installation required — just press **"Open in Playground"**, then **Run All
  y_eval=None, # numpy array (float or int) 1 dimension (eval_num_samples)
  eval_every_n_trees=None, # const (int) >= 1
  early_stopping_rounds=None, # const (int) >= 1
+ eval_metric='mse' # string, one of 'mse' or 'corr'. For corr, loss is 1 - correlation(y_true, preds)
  )
  ```
  Train with optional validation set and early stopping.
@@ -233,4 +234,8 @@ WarpGBM builds on the shoulders of PyTorch, scikit-learn, LightGBM, and the CUDA

  ### v0.1.25

- - Added `colsample_bytree` parameter and new test using Numerai data.
+ - Added `colsample_bytree` parameter and new test using Numerai data.
+
+ ### v0.1.26
+
+ - Fix Memory bugs in prediction and colsample bytree logic. Added "corr" eval metric.
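The `eval_metric` line added to the PKG-INFO and README snippets above is the user-facing surface of this release. A minimal sketch of how a caller might exercise it, assuming a CUDA-capable machine; the synthetic data and hyperparameters are illustrative, not taken from the package's documentation:

```python
import numpy as np
from warpgbm import WarpGBM

# Illustrative synthetic regression data (shapes and values are arbitrary).
rng = np.random.default_rng(0)
X = rng.standard_normal((10_000, 20)).astype(np.float32)
y = X[:, 0] * 2.0 + 0.1 * rng.standard_normal(10_000).astype(np.float32)

model = WarpGBM(max_depth=4, num_bins=16, n_estimators=50)
model.fit(
    X, y,
    X_eval=X, y_eval=y,      # in-sample eval set, purely for illustration
    eval_every_n_trees=10,
    early_stopping_rounds=2,
    eval_metric="corr",      # new in 0.1.26: eval loss = 1 - correlation(y_true, preds)
)
preds = model.predict(X)
```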
{warpgbm-0.1.25 → warpgbm-0.1.27}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "warpgbm"
- version = "0.1.25"
+ version = "0.1.27"
  description = "A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -13,5 +13,5 @@ dependencies = [
  "torch",
  "numpy",
  "tqdm",
- "scikit-learn"
+ "scikit-learn"
  ]
warpgbm-0.1.27/tests/full_numerai_test.py (new file)
@@ -0,0 +1,67 @@
+ from numerapi import NumerAPI
+ import pandas as pd
+ import numpy as np
+ from warpgbm import WarpGBM
+ import time
+ from sklearn.metrics import mean_squared_error
+
+
+ def predict_in_chunks(model, X, chunk_size=100_000):
+     preds = []
+     for i in range(0, X.shape[0], chunk_size):
+         X_chunk = X[i : i + chunk_size]
+         preds.append(model.predict(X_chunk))
+     return np.concatenate(preds)
+
+
+ def test_numerai_data():
+     napi = NumerAPI()
+     napi.download_dataset("v5.0/train.parquet", "numerai_train.parquet")
+     napi.download_dataset("v5.0/validation.parquet", "numerai_validation.parquet")
+
+     data = pd.concat([
+         pd.read_parquet("numerai_train.parquet"),
+         pd.read_parquet("numerai_validation.parquet")
+     ])
+     features = [f for f in list(data) if "feature" in f]
+     target = "target"
+     data = data.loc[data[ target].isna() == False ]
+
+     X = data[features].astype("int8").values[:]
+     y = data[target].values
+
+     model = WarpGBM(
+         max_depth=3,
+         num_bins=5,
+         n_estimators=10,
+         learning_rate=1,
+         threads_per_block=64,
+         rows_per_thread=4,
+         colsample_bytree=0.8,
+     )
+
+     start_fit = time.time()
+     model.fit(
+         X,
+         y,
+         # era_id=era,
+         # X_eval=X,
+         # y_eval=y,
+         # eval_every_n_trees=10,
+         # early_stopping_rounds=1,
+     )
+     fit_time = time.time() - start_fit
+     print(f" Fit time: {fit_time:.3f} seconds")
+
+     start_pred = time.time()
+     preds = predict_in_chunks(model, X, chunk_size=500_000)
+     pred_time = time.time() - start_pred
+     print(f" Predict time: {pred_time:.3f} seconds")
+
+     corr = np.corrcoef(preds, y)[0, 1]
+     mse = mean_squared_error(preds, y)
+     print(f" Correlation: {corr:.4f}")
+     print(f" MSE: {mse:.4f}")
+
+     # assert corr > 0.68, f"In-sample correlation too low: {corr}"
+     # assert mse < 0.03, f"In-sample mse too high: {mse}"
warpgbm-0.1.27/tests/test_fit_predict_corr.py (new file)
@@ -0,0 +1,52 @@
+ import numpy as np
+ from warpgbm import WarpGBM
+ from sklearn.datasets import make_regression
+ import time
+ from sklearn.metrics import mean_squared_error
+
+
+ def test_fit_predictpytee_correlation():
+     np.random.seed(42)
+     N = 100_000
+     F = 1000
+     X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
+     era = np.zeros(N, dtype=np.int32)
+     corrs = []
+     mses = []
+
+     model = WarpGBM(
+         max_depth=10,
+         num_bins=10,
+         n_estimators=100,
+         learning_rate=1,
+         threads_per_block=64,
+         rows_per_thread=4,
+         colsample_bytree=1.0,
+     )
+
+     start_fit = time.time()
+     model.fit(
+         X,
+         y,
+         era_id=era,
+         X_eval=X,
+         y_eval=y,
+         eval_every_n_trees=10,
+         early_stopping_rounds=1,
+         eval_metric="corr",
+     )
+     fit_time = time.time() - start_fit
+     print(f" Fit time: {fit_time:.3f} seconds")
+
+     start_pred = time.time()
+     preds = model.predict(X)
+     pred_time = time.time() - start_pred
+     print(f" Predict time: {pred_time:.3f} seconds")
+
+     corr = np.corrcoef(preds, y)[0, 1]
+     mse = mean_squared_error(preds, y)
+     print(f" Correlation: {corr:.4f}")
+     print(f" MSE: {mse:.4f}")
+
+     assert (corr > 0.9), f"In-sample correlation too low: {corrs}"
+     assert (mse < 2), f"In-sample mse too high: {mses}"
warpgbm-0.1.27/version.txt (new file)
@@ -0,0 +1 @@
+ 0.1.27
{warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/core.py
@@ -1,19 +1,14 @@
  import torch
  import numpy as np
  from sklearn.base import BaseEstimator, RegressorMixin
+ from sklearn.metrics import mean_squared_log_error
  from warpgbm.cuda import node_kernel
+ from warpgbm.metrics import rmsle_torch
  from tqdm import tqdm
  from typing import Tuple
  from torch import Tensor
  import gc

- histogram_kernels = {
-     "hist1": node_kernel.compute_histogram,
-     "hist2": node_kernel.compute_histogram2,
-     "hist3": node_kernel.compute_histogram3,
- }
-
-
  class WarpGBM(BaseEstimator, RegressorMixin):
      def __init__(
          self,
@@ -23,8 +18,6 @@ class WarpGBM(BaseEstimator, RegressorMixin):
          n_estimators=100,
          min_child_weight=20,
          min_split_gain=0.0,
-         verbosity=True,
-         histogram_computer="hist3",
          threads_per_block=64,
          rows_per_thread=4,
          L2_reg=1e-6,
@@ -40,7 +33,6 @@
              n_estimators=n_estimators,
              min_child_weight=min_child_weight,
              min_split_gain=min_split_gain,
-             histogram_computer=histogram_computer,
              threads_per_block=threads_per_block,
              rows_per_thread=rows_per_thread,
              L2_reg=L2_reg,
@@ -68,7 +60,6 @@
          self.min_child_weight = min_child_weight
          self.min_split_gain = min_split_gain
          self.best_bin = torch.tensor([-1], dtype=torch.int32, device=self.device)
-         self.compute_histogram = histogram_kernels[histogram_computer]
          self.threads_per_block = threads_per_block
          self.rows_per_thread = rows_per_thread
          self.L2_reg = L2_reg
@@ -128,17 +119,13 @@
          )
          if kwargs["L2_reg"] < 0 or kwargs["L1_reg"] < 0:
              raise ValueError("L2_reg and L1_reg must be non-negative.")
-         if kwargs["histogram_computer"] not in histogram_kernels:
-             raise ValueError(
-                 f"Invalid histogram_computer: {kwargs['histogram_computer']}. Choose from {list(histogram_kernels.keys())}."
-             )
          if kwargs["colsample_bytree"] <= 0 or kwargs["colsample_bytree"] > 1:
              raise ValueError(
                  f"Invalid colsample_bytree: {kwargs['colsample_bytree']}. Must be a float value > 0 and <= 1."
              )

      def validate_fit_params(
-         self, X, y, era_id, X_eval, y_eval, eval_every_n_trees, early_stopping_rounds
+         self, X, y, era_id, X_eval, y_eval, eval_every_n_trees, early_stopping_rounds, eval_metric
      ):
          # ─── Required: X and y ───
          if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
@@ -206,6 +193,11 @@ class WarpGBM(BaseEstimator, RegressorMixin):
              # No early stopping = set to "never trigger"
              early_stopping_rounds = self.n_estimators + 1

+         if eval_metric not in ["mse", "corr", "rmsle"]:
+             raise ValueError(
+                 f"Invalid eval_metric: {eval_metric}. Choose 'mse' or 'corr', 'rmsle'."
+             )
+
          return early_stopping_rounds  # May have been defaulted here

      def fit(
@@ -217,9 +209,10 @@
          y_eval=None,
          eval_every_n_trees=None,
          early_stopping_rounds=None,
+         eval_metric = "mse",
      ):
          early_stopping_rounds = self.validate_fit_params(
-             X, y, era_id, X_eval, y_eval, eval_every_n_trees, early_stopping_rounds
+             X, y, era_id, X_eval, y_eval, eval_every_n_trees, early_stopping_rounds, eval_metric
          )

          if era_id is None:
@@ -231,21 +224,24 @@
          )
          self.num_samples, self.num_features = X.shape
          self.gradients = torch.zeros_like(self.Y_gpu)
-         self.root_node_indices = torch.arange(self.num_samples, device=self.device)
+         self.root_node_indices = torch.arange(self.num_samples, device=self.device, dtype=torch.int32)
          self.base_prediction = self.Y_gpu.mean().item()
          self.gradients += self.base_prediction
-         self.best_gains = torch.zeros(self.num_features, device=self.device)
-         self.best_bins = torch.zeros(
-             self.num_features, device=self.device, dtype=torch.int32
-         )
-         self.feature_indices = torch.arange(self.num_features, device=self.device)
+         if self.colsample_bytree < 1.0:
+             k = max(1, int(self.colsample_bytree * self.num_features))
+         else:
+             k = self.num_features
+         self.best_gains = torch.zeros(k, device=self.device)
+         self.best_bins = torch.zeros(k, device=self.device, dtype=torch.int32)
+         self.feature_indices = torch.arange(self.num_features, device=self.device, dtype=torch.int32)

          # ─── Optional Eval Set ───
          if X_eval is not None and y_eval is not None:
-             self.bin_indices_eval = self.bin_data_with_existing_edges(X_eval)
+             self.bin_indices_eval = self.bin_inference_data(X_eval)
              self.Y_gpu_eval = torch.from_numpy(y_eval).to(torch.float32).to(self.device)
              self.eval_every_n_trees = eval_every_n_trees
              self.early_stopping_rounds = early_stopping_rounds
+             self.eval_metric = eval_metric
          else:
              self.bin_indices_eval = None
              self.Y_gpu_eval = None
@@ -266,50 +262,47 @@
      def preprocess_gpu_data(self, X_np, Y_np, era_id_np):
          with torch.no_grad():
              self.num_samples, self.num_features = X_np.shape
+
              Y_gpu = torch.from_numpy(Y_np).type(torch.float32).to(self.device)
-             era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)
-             is_integer_type = np.issubdtype(X_np.dtype, np.integer)
-             if is_integer_type:
-                 max_vals = X_np.max(axis=0)
-                 if np.all(max_vals < self.num_bins):
-                     print(
-                         "Detected pre-binned integer input — skipping quantile binning."
-                     )
-                     bin_indices = (
-                         torch.from_numpy(X_np)
-                         .to(self.device)
-                         .contiguous()
-                         .to(torch.int8)
-                     )

-                     # We'll store None or an empty tensor in self.bin_edges
-                     # to indicate that we skip binning at predict-time
-                     bin_edges = torch.arange(
-                         1, self.num_bins, dtype=torch.float32
-                     ).repeat(self.num_features, 1)
-                     bin_edges = bin_edges.to(self.device)
-                     unique_eras, era_indices = torch.unique(
-                         era_id_gpu, return_inverse=True
-                     )
-                     return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
-                 else:
-                     print(
-                         "Integer input detected, but values exceed num_bins — falling back to quantile binning."
-                     )
+             era_id_gpu = torch.from_numpy(era_id_np).type(torch.int32).to(self.device)

              bin_indices = torch.empty(
                  (self.num_samples, self.num_features), dtype=torch.int8, device="cuda"
              )
+
+             is_integer_type = np.issubdtype(X_np.dtype, np.integer)
+             max_vals = X_np.max(axis=0)
+
+             if is_integer_type and np.all(max_vals < self.num_bins):
+                 print(
+                     "Detected pre-binned integer input — skipping quantile binning."
+                 )
+                 for f in range(self.num_features):
+                     bin_indices[:,f] = torch.as_tensor( X_np[:, f], device=self.device).contiguous()
+                 # bin_indices = X_np.to("cuda", non_blocking=True).contiguous()
+
+                 # We'll store None or an empty tensor in self.bin_edges
+                 # to indicate that we skip binning at predict-time
+                 bin_edges = torch.arange(
+                     1, self.num_bins, dtype=torch.float32
+                 ).repeat(self.num_features, 1)
+                 bin_edges = bin_edges.to(self.device)
+                 unique_eras, era_indices = torch.unique(
+                     era_id_gpu, return_inverse=True
+                 )
+                 return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu
+
+             print("quantile binning.")

              bin_edges = torch.empty(
                  (self.num_features, self.num_bins - 1),
                  dtype=torch.float32,
                  device="cuda",
              )

-             X_np = torch.from_numpy(X_np).to(torch.float32).pin_memory()
-
              for f in range(self.num_features):
-                 X_f = X_np[:, f].to("cuda", non_blocking=True)
+                 X_f = torch.as_tensor( X_np[:, f], device=self.device, dtype=torch.float32 ).contiguous()
                  quantiles = torch.linspace(
                      0, 1, self.num_bins + 1, device="cuda", dtype=X_f.dtype
                  )[1:-1]
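The reworked `preprocess_gpu_data` hunk above now fills `bin_indices` column by column in both paths, and the pre-binned fast path hinges on a single check: integer dtype and every per-column maximum below `num_bins`. A standalone sketch of that check (`is_prebinned` is a hypothetical helper written here for illustration, not a function in the package):

```python
import numpy as np

def is_prebinned(X_np: np.ndarray, num_bins: int) -> bool:
    # Pre-binned means integer values already in [0, num_bins),
    # so quantile binning can be skipped entirely.
    return np.issubdtype(X_np.dtype, np.integer) and bool(
        np.all(X_np.max(axis=0) < num_bins)
    )

# Each column holds a constant value in 0..4 (cf. Numerai's 5-valued features).
X = np.tile(np.arange(5, dtype=np.int8), (200, 2))  # shape (200, 10)
assert is_prebinned(X, num_bins=5)      # max value 4 fits in 5 bins
assert not is_prebinned(X, num_bins=4)  # a value of 4 overflows 4 bins
```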
@@ -324,17 +317,19 @@
              unique_eras, era_indices = torch.unique(era_id_gpu, return_inverse=True)
              return bin_indices, era_indices, bin_edges, unique_eras, Y_gpu

-     def compute_histograms(self, bin_indices_sub, gradients):
+     def compute_histograms(self, sample_indices, feature_indices):
          grad_hist = torch.zeros(
-             (self.num_features, self.num_bins), device=self.device, dtype=torch.float32
+             (len(feature_indices), self.num_bins), device=self.device, dtype=torch.float32
          )
          hess_hist = torch.zeros(
-             (self.num_features, self.num_bins), device=self.device, dtype=torch.float32
+             (len(feature_indices), self.num_bins), device=self.device, dtype=torch.float32
          )

-         self.compute_histogram(
-             bin_indices_sub,
-             gradients,
+         node_kernel.compute_histogram3(
+             self.bin_indices,
+             self.residual,
+             sample_indices,
+             feature_indices,
              grad_hist,
              hess_hist,
              self.num_bins,
@@ -357,6 +352,9 @@
          if torch.all(self.best_bins == -1):
              return -1, -1  # No valid split found
+
+         # print(self.best_bins)
+         # print(self.best_gains)

          f = torch.argmax(self.best_gains).item()
          b = self.best_bins[f].item()
@@ -374,28 +372,38 @@
              gradient_histogram, hessian_histogram
          )

+         # print(local_feature, best_bin)
+
          if local_feature == -1:
              leaf_value = self.residual[node_indices].mean()
              self.gradients[node_indices] += self.learning_rate * leaf_value
              return {"leaf_value": leaf_value.item(), "samples": parent_size}
-
-         split_mask = self.bin_indices_tree[node_indices, local_feature] <= best_bin
+
+         # print("DEBUG SHAPES -> bin_indices:", self.bin_indices.shape,
+         #       "| node_indices max:", node_indices.max().item(),
+         #       "| local_feature:", local_feature,
+         #       "| feat_indices_tree len:", len(self.feat_indices_tree),
+         #       "| feat index:", self.feat_indices_tree[local_feature])
+
+         split_mask = self.bin_indices[node_indices, self.feat_indices_tree[local_feature]] <= best_bin
          left_indices = node_indices[split_mask]
          right_indices = node_indices[~split_mask]

+         # print("DEBUG SHAPES -> left_indices:", left_indices.shape,
+         #       "| right_indices:", right_indices.shape,
+         #       "| parent_size:", parent_size,
+         #       "| local_feature:", local_feature,
+         #       "| best_bin:", best_bin)
+
          left_size = left_indices.numel()
          right_size = right_indices.numel()

          if left_size <= right_size:
-             grad_hist_left, hess_hist_left = self.compute_histograms(
-                 self.bin_indices_tree[left_indices], self.residual[left_indices]
-             )
+             grad_hist_left, hess_hist_left = self.compute_histograms( left_indices, self.feat_indices_tree )
              grad_hist_right = gradient_histogram - grad_hist_left
              hess_hist_right = hessian_histogram - hess_hist_left
          else:
-             grad_hist_right, hess_hist_right = self.compute_histograms(
-                 self.bin_indices_tree[right_indices], self.residual[right_indices]
-             )
+             grad_hist_right, hess_hist_right = self.compute_histograms( right_indices, self.feat_indices_tree )
              grad_hist_left = gradient_histogram - grad_hist_right
              hess_hist_left = hessian_histogram - hess_hist_right
@@ -413,25 +421,35 @@
                  "left": left_child,
                  "right": right_child,
              }
+
+     def get_eval_metric(self, y_true, y_pred):
+         if self.eval_metric == "mse":
+             return ((y_true - y_pred) ** 2).mean().item()
+         elif self.eval_metric == "corr":
+             return 1 - torch.corrcoef(torch.vstack([y_true, y_pred]))[0, 1].item()
+         elif self.eval_metric == "rmsle":
+             return rmsle_torch(y_true, y_pred).item()
+         else:
+             raise ValueError(f"Invalid eval_metric: {self.eval_metric}.")

      def compute_eval(self, i):
          if self.eval_every_n_trees == None:
              return
+
+         train_loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
+         self.training_loss.append(train_loss)

          if i % self.eval_every_n_trees == 0:
              eval_preds = self.predict_binned(self.bin_indices_eval)
-             eval_loss = ((self.Y_gpu_eval - eval_preds) ** 2).mean().item()
+             eval_loss = self.get_eval_metric( self.Y_gpu_eval, eval_preds )
              self.eval_loss.append(eval_loss)

-             train_loss = ((self.Y_gpu - self.gradients) ** 2).mean().item()
-             self.training_loss.append(train_loss)
-
              if len(self.eval_loss) > self.early_stopping_rounds:
-                 if self.eval_loss[-self.early_stopping_rounds] < self.eval_loss[-1]:
+                 if self.eval_loss[-(self.early_stopping_rounds+1)] < self.eval_loss[-1]:
                      self.stop = True

              print(
-                 f"🌲 Tree {i+1}/{self.n_estimators} | Train MSE: {train_loss:.6f} | Eval MSE: {eval_loss:.6f}"
+                 f"🌲 Tree {i+1}/{self.n_estimators} | Train MSE: {train_loss:.6f} | Eval {self.eval_metric}: {eval_loss:.6f}"
              )

              del eval_preds, eval_loss, train_loss
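The early-stopping comparison in `compute_eval` above is the subtle one-line fix in this release: with the old `eval_loss[-self.early_stopping_rounds]` the comparison window was off by one, and with `early_stopping_rounds=1` the trigger could never fire at all, since the latest loss was compared against itself. A small self-contained sketch of the corrected window logic (a hypothetical helper, mirroring the condition in the hunk):

```python
def should_stop(eval_loss, early_stopping_rounds):
    # Stop once the score from (early_stopping_rounds + 1) evaluations ago
    # is still better than the latest, i.e. no improvement over the last
    # `early_stopping_rounds` evaluations.
    if len(eval_loss) <= early_stopping_rounds:
        return False
    return eval_loss[-(early_stopping_rounds + 1)] < eval_loss[-1]

assert not should_stop([0.50, 0.40], early_stopping_rounds=2)        # too few evals yet
assert should_stop([0.30, 0.35, 0.36], early_stopping_rounds=2)      # 0.30 beats 0.36
assert not should_stop([0.40, 0.35, 0.30], early_stopping_rounds=2)  # still improving
```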
@@ -445,20 +463,14 @@
              k = max(1, int(self.colsample_bytree * self.num_features))
          else:
              self.feat_indices_tree = self.feature_indices
-             self.bin_indices_tree = self.bin_indices

          for i in range(self.n_estimators):
              self.residual = self.Y_gpu - self.gradients

              if self.colsample_bytree < 1.0:
-                 self.feat_indices_tree = torch.randperm(
-                     self.num_features, device=self.device
-                 )[:k]
-                 self.bin_indices_tree = self.bin_indices[:, self.feat_indices_tree]
+                 self.feat_indices_tree = torch.randperm(self.num_features, device=self.device, dtype=torch.int32)[:k]

-             self.root_gradient_histogram, self.root_hessian_histogram = (
-                 self.compute_histograms(self.bin_indices_tree, self.residual)
-             )
+             self.root_gradient_histogram, self.root_hessian_histogram = self.compute_histograms( self.root_node_indices, self.feat_indices_tree )

              tree = self.grow_tree(
                  self.root_gradient_histogram,
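This hunk is the second half of the `colsample_bytree` memory fix: instead of materializing `self.bin_indices[:, self.feat_indices_tree]` (a full copy of the binned matrix every tree), only the index vector is sampled and handed to the histogram kernel. A minimal sketch of the sampling step on its own, assuming PyTorch (`sample_features` is a hypothetical name for illustration):

```python
import torch

def sample_features(num_features, colsample_bytree, device="cpu"):
    # Draw a per-tree random subset of feature indices; the binned
    # matrix itself is never sliced or copied.
    k = max(1, int(colsample_bytree * num_features))
    return torch.randperm(num_features, device=device, dtype=torch.int32)[:k]

feats = sample_features(1000, 0.8)
print(feats.shape)  # torch.Size([800])
```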
@@ -476,14 +488,13 @@
          print("Finished training forest.")

      def bin_data_with_existing_edges(self, X_np):
-         X_tensor = torch.from_numpy(X_np).type(torch.float32).pin_memory()
-         num_samples = X_tensor.size(0)
+         num_samples = X_np.shape[0]
          bin_indices = torch.zeros(
              (num_samples, self.num_features), dtype=torch.int8, device=self.device
          )
          with torch.no_grad():
              for f in range(self.num_features):
-                 X_f = X_tensor[:, f].to(self.device, non_blocking=True)
+                 X_f = torch.as_tensor( X_np[:, f], device=self.device, dtype=torch.float32 ).contiguous()
                  bin_edges_f = self.bin_edges[f]
                  bin_indices_f = bin_indices[:, f].contiguous()
                  node_kernel.custom_cuda_binner(X_f, bin_edges_f, bin_indices_f)
@@ -493,7 +504,6 @@
      def predict_binned(self, bin_indices):
          num_samples = bin_indices.size(0)
-
          tree_tensor = torch.stack(
              [
                  self.flatten_tree(tree, max_nodes=2 ** (self.max_depth + 1))
@@ -508,8 +518,8 @@
          )

          return out
-
-     def predict(self, X_np):
+
+     def bin_inference_data(self, X_np):
          is_integer_type = np.issubdtype(X_np.dtype, np.integer)

          if is_integer_type and X_np.shape[1] == self.num_features:
@@ -523,12 +533,17 @@
              is_prebinned = False

          if is_prebinned:
-             bin_indices = (
-                 torch.from_numpy(X_np).to(self.device).contiguous().to(torch.int8)
+             bin_indices = torch.empty(
+                 X_np.shape, dtype=torch.int8, device="cuda"
              )
+             for f in range(self.num_features):
+                 bin_indices[:,f] = torch.as_tensor( X_np[:, f], device=self.device).contiguous()
          else:
              bin_indices = self.bin_data_with_existing_edges(X_np)
+         return bin_indices

+     def predict(self, X_np):
+         bin_indices = self.bin_inference_data(X_np)
          preds = self.predict_binned(bin_indices).cpu().numpy()
          del bin_indices
          return preds
warpgbm-0.1.27/warpgbm/cuda/histogram_kernel.cu (new file)
@@ -0,0 +1,95 @@
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+ #include <torch/extension.h>
+
+ __global__ void histogram_tiled_configurable_kernel(
+     const int8_t *__restrict__ bin_indices,      // [N, F]
+     const float *__restrict__ residuals,         // [N]
+     const int32_t *__restrict__ sample_indices,  // [N]
+     const int32_t *__restrict__ feature_indices, // [F]
+     float *__restrict__ grad_hist,               // [F * B]
+     float *__restrict__ hess_hist,               // [F * B]
+     int64_t N, int64_t F, int64_t B,
+     int rows_per_thread)
+ {
+     int hist_feat_idx = blockIdx.x;
+     int feat = feature_indices[ hist_feat_idx ]; // 1 block per feature
+     int row_start = (blockIdx.y * blockDim.x + threadIdx.x) * rows_per_thread;
+
+     extern __shared__ float shmem[];
+     float *sh_grad = shmem;       // [B]
+     float *sh_hess = &sh_grad[B]; // [B]
+
+     // Initialize shared memory histograms
+     for (int b = threadIdx.x; b < B; b += blockDim.x)
+     {
+         sh_grad[b] = 0.0f;
+         sh_hess[b] = 0.0f;
+     }
+     __syncthreads();
+
+     // Each thread processes multiple rows
+     for (int r = 0; r < rows_per_thread; ++r)
+     {
+         int row = row_start + r;
+         if (row < N)
+         {
+             int sample = sample_indices[row];
+             int8_t bin = bin_indices[sample * F + feat];
+             if (bin >= 0 && bin < B)
+             {
+                 atomicAdd(&sh_grad[bin], residuals[sample]);
+                 atomicAdd(&sh_hess[bin], 1.0f);
+             }
+         }
+     }
+     __syncthreads();
+
+     // One thread per bin writes results back to global memory
+     for (int b = threadIdx.x; b < B; b += blockDim.x)
+     {
+         int64_t idx = hist_feat_idx * B + b;
+         atomicAdd(&grad_hist[idx], sh_grad[b]);
+         atomicAdd(&hess_hist[idx], sh_hess[b]);
+     }
+ }
+
+ void launch_histogram_kernel_cuda_configurable(
+     const at::Tensor &bin_indices,
+     const at::Tensor &residuals,
+     const at::Tensor &sample_indices,
+     const at::Tensor &feature_indices,
+     at::Tensor &grad_hist,
+     at::Tensor &hess_hist,
+     int num_bins,
+     int threads_per_block = 256,
+     int rows_per_thread = 1)
+ {
+
+     int64_t N = sample_indices.size(0);
+     int64_t F = feature_indices.size(0);
+     int num_features_master = bin_indices.size(1);
+
+     int64_t rows_per_block = threads_per_block * rows_per_thread;
+     int64_t row_tiles = (N + rows_per_block - 1) / rows_per_block;
+
+     dim3 blocks(F, row_tiles); // grid.x = F, grid.y = row_tiles
+     dim3 threads(threads_per_block);
+     int shared_mem_bytes = 2 * num_bins * sizeof(float);
+
+     histogram_tiled_configurable_kernel<<<blocks, threads, shared_mem_bytes>>>(
+         bin_indices.data_ptr<int8_t>(),
+         residuals.data_ptr<float>(),
+         sample_indices.data_ptr<int32_t>(),
+         feature_indices.data_ptr<int32_t>(),
+         grad_hist.data_ptr<float>(),
+         hess_hist.data_ptr<float>(),
+         N, num_features_master, num_bins,
+         rows_per_thread);
+
+     cudaError_t err = cudaGetLastError();
+     if (err != cudaSuccess)
+     {
+         printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
+     }
+ }
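For readers who would rather not trace the CUDA: for each selected feature f and bin b, the kernel above accumulates the sum of residuals of the selected samples whose bin index is b (the gradient histogram) and their count (the hessian histogram; for squared error the per-sample hessian is 1). A plain-PyTorch reference of that contract, written here purely as documentation and not part of the package:

```python
import torch

def histograms_reference(bin_indices, residuals, sample_indices, feature_indices, num_bins):
    # bin_indices: [N, F] int8; residuals: [N] float32;
    # sample_indices / feature_indices: int32 index vectors.
    sub = bin_indices[sample_indices.long()][:, feature_indices.long()].long()  # [n, f]
    res = residuals[sample_indices.long()]                                      # [n]
    f = feature_indices.numel()
    # Flatten each (feature, bin) pair into one slot, then scatter-add.
    flat = (sub + torch.arange(f) * num_bins).reshape(-1)
    grad = torch.zeros(f * num_bins).scatter_add_(0, flat, res.repeat_interleave(f))
    hess = torch.zeros(f * num_bins).scatter_add_(0, flat, torch.ones_like(flat, dtype=torch.float32))
    return grad.view(f, num_bins), hess.view(f, num_bins)

# Tiny smoke test of the contract: N=3 samples, F=2 features, 3 bins.
bins = torch.tensor([[0, 1], [1, 1], [2, 0]], dtype=torch.int8)
res = torch.tensor([1.0, 2.0, 4.0])
g, h = histograms_reference(bins, res,
                            torch.arange(3, dtype=torch.int32),
                            torch.arange(2, dtype=torch.int32), num_bins=3)
print(g)  # [[1., 2., 4.], [4., 3., 0.]]
print(h)  # [[1., 1., 1.], [1., 2., 0.]]
```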
{warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/node_kernel.cpp
@@ -2,23 +2,6 @@
  #include <vector>

  // Declare the function from histogram_kernel.cu
- void launch_histogram_kernel_cuda(
-     const at::Tensor &bin_indices,
-     const at::Tensor &gradients,
-     at::Tensor &grad_hist,
-     at::Tensor &hess_hist,
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1);
-
- void launch_histogram_kernel_cuda_2(
-     const at::Tensor &bin_indices, // int8 [N, F]
-     const at::Tensor &gradients,   // float32 [N]
-     at::Tensor &grad_hist,         // float32 [F * B]
-     at::Tensor &hess_hist,         // float32 [F * B]
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1);

  void launch_best_split_kernel_cuda(
      const at::Tensor &G, // [F x B]
@@ -32,7 +15,9 @@ void launch_best_split_kernel_cuda(

  void launch_histogram_kernel_cuda_configurable(
      const at::Tensor &bin_indices,
-     const at::Tensor &gradients,
+     const at::Tensor &residual,
+     const at::Tensor &sample_indices,
+     const at::Tensor &feature_indices,
      at::Tensor &grad_hist,
      at::Tensor &hess_hist,
      int num_bins,
@@ -54,8 +39,6 @@ void predict_with_forest(
  // Bindings
  PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
  {
-     m.def("compute_histogram", &launch_histogram_kernel_cuda, "Histogram (CUDA)");
-     m.def("compute_histogram2", &launch_histogram_kernel_cuda_2, "Histogram (CUDA) 2");
      m.def("compute_histogram3", &launch_histogram_kernel_cuda_configurable, "Histogram Feature Shared Mem");
      m.def("compute_split", &launch_best_split_kernel_cuda, "Best Split (CUDA)");
      m.def("custom_cuda_binner", &launch_bin_column_kernel, "Custom CUDA binning kernel");
{warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm/cuda/predict.cu
@@ -5,23 +5,18 @@
  __global__ void predict_forest_kernel(
      const int8_t *__restrict__ bin_indices, // [N x F]
      const float *__restrict__ tree_tensor,  // [T x max_nodes x 6]
-     int N, int F, int T, int max_nodes,
+     int64_t N, int64_t F, int64_t T, int64_t max_nodes,
      float learning_rate,
      float *__restrict__ out // [N]
  )
  {
-     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-     int total_jobs = N * T;
+     int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+     int64_t total_jobs = N * T;
      if (idx >= total_jobs)
          return;

-     int i = idx % N; // sample index
-     int t = idx / N; // tree index
-
-     // if (i == 0 && t == 0)
-     // {
-     //     printf("[DEBUG] Thread (i=%d, t=%d): starting prediction\n", i, t);
-     // }
+     int64_t i = idx % N; // sample index
+     int64_t t = idx / N; // tree index

      const float *tree = tree_tensor + t * max_nodes * 6;

@@ -35,32 +30,36 @@ __global__ void predict_forest_kernel(
          atomicAdd(&out[i], learning_rate * val);
          return;
      }
+
      int feat = static_cast<int>(tree[node_id * 6 + 0]);
      int split_bin = static_cast<int>(tree[node_id * 6 + 1]);
      int left_id = static_cast<int>(tree[node_id * 6 + 2]);
      int right_id = static_cast<int>(tree[node_id * 6 + 3]);

-     int8_t bin = bin_indices[i * F + feat];
+     // prevent overflow
+     int64_t bin_idx = i * F + feat;
+     int8_t bin = bin_indices[bin_idx];
+
      node_id = (bin <= split_bin) ? left_id : right_id;
-     // printf("sample %d, tree %d, feat %d, bin %d, split %d → %s\n", i, t, feat, bin, split_bin, (bin <= split_bin ? "L" : "R"));
      }
  }

+
  void predict_with_forest(
-     const at::Tensor &bin_indices, // [N x F], int8
-     const at::Tensor &tree_tensor, // [T x max_nodes x 6], float32
+     const at::Tensor &bin_indices,
+     const at::Tensor &tree_tensor,
      float learning_rate,
-     at::Tensor &out // [N], float32
+     at::Tensor &out
  )
  {
-     int N = bin_indices.size(0);
-     int F = bin_indices.size(1);
-     int T = tree_tensor.size(0);
-     int max_nodes = tree_tensor.size(1);
+     int64_t N = bin_indices.size(0);
+     int64_t F = bin_indices.size(1);
+     int64_t T = tree_tensor.size(0);
+     int64_t max_nodes = tree_tensor.size(1);

-     int total_jobs = N * T;
+     int64_t total_jobs = N * T;
      int threads_per_block = 256;
-     int blocks = (total_jobs + threads_per_block - 1) / threads_per_block;
+     int64_t blocks = (total_jobs + threads_per_block - 1) / threads_per_block;

      predict_forest_kernel<<<blocks, threads_per_block>>>(
          bin_indices.data_ptr<int8_t>(),
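All of the predict.cu changes above serve 64-bit indexing: with 32-bit `int`, both `N * T` (total thread jobs) and `i * F + feat` (the flat index into `bin_indices`) wrap past 2^31 - 1, which a few million rows times a thousand features reaches easily; this is likely the prediction memory bug named in the changelog. A quick numpy illustration of the failure mode, mimicking the old `int` arithmetic:

```python
import numpy as np

N, F = 3_000_000, 1_000            # plausible Numerai-scale shapes
i, feat = N - 1, F - 1             # last sample, last feature

flat64 = np.int64(i) * F + feat    # what the fixed kernel computes
print(flat64)                      # 2999999999 > INT32_MAX (2147483647)

with np.errstate(over="ignore"):   # silence numpy's overflow warning
    flat32 = np.int32(i) * np.int32(F) + np.int32(feat)
print(flat32)                      # wraps to a negative, garbage index
```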
warpgbm-0.1.27/warpgbm/metrics.py (new file)
@@ -0,0 +1,10 @@
+ # warpgbm/metrics.py
+
+ import torch
+
+ def rmsle_torch(y_true, y_pred, eps=1e-7):
+     y_true = torch.clamp(y_true, min=0)
+     y_pred = torch.clamp(y_pred, min=0)
+     log_true = torch.log1p(y_true + eps)
+     log_pred = torch.log1p(y_pred + eps)
+     return torch.sqrt(torch.mean((log_true - log_pred) ** 2))
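The new `warpgbm/metrics.py` backs the `rmsle` branch of `get_eval_metric` in core.py (note that the README hunks above document only `'mse'` and `'corr'`, while core.py additionally accepts `'rmsle'`). A short usage sketch with arbitrary values:

```python
import torch
from warpgbm.metrics import rmsle_torch

y_true = torch.tensor([1.0, 10.0, 100.0])
y_pred = torch.tensor([1.5, 8.0, 120.0])
print(rmsle_torch(y_true, y_pred))  # ≈ 0.20
```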
{warpgbm-0.1.25 → warpgbm-0.1.27/warpgbm.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: warpgbm
- Version: 0.1.25
+ Version: 0.1.27
  Summary: A fast GPU-accelerated Gradient Boosted Decision Tree library with PyTorch + CUDA
  License: GNU GENERAL PUBLIC LICENSE
  Version 3, 29 June 2007
@@ -889,6 +889,7 @@ No installation required — just press **"Open in Playground"**, then **Run All
  y_eval=None, # numpy array (float or int) 1 dimension (eval_num_samples)
  eval_every_n_trees=None, # const (int) >= 1
  early_stopping_rounds=None, # const (int) >= 1
+ eval_metric='mse' # string, one of 'mse' or 'corr'. For corr, loss is 1 - correlation(y_true, preds)
  )
  ```
  Train with optional validation set and early stopping.
@@ -922,3 +923,7 @@ WarpGBM builds on the shoulders of PyTorch, scikit-learn, LightGBM, and the CUDA
  ### v0.1.25

  - Added `colsample_bytree` parameter and new test using Numerai data.
+
+ ### v0.1.26
+
+ - Fix Memory bugs in prediction and colsample bytree logic. Added "corr" eval metric.
{warpgbm-0.1.25 → warpgbm-0.1.27}/warpgbm.egg-info/SOURCES.txt
@@ -5,10 +5,12 @@ pyproject.toml
  setup.py
  version.txt
  tests/__init__.py
+ tests/full_numerai_test.py
  tests/numerai_test.py
  tests/test_fit_predict_corr.py
  warpgbm/__init__.py
  warpgbm/core.py
+ warpgbm/metrics.py
  warpgbm.egg-info/PKG-INFO
  warpgbm.egg-info/SOURCES.txt
  warpgbm.egg-info/dependency_links.txt
warpgbm-0.1.25/tests/test_fit_predict_corr.py (deleted)
@@ -1,57 +0,0 @@
- import numpy as np
- from warpgbm import WarpGBM
- from sklearn.datasets import make_regression
- import time
- from sklearn.metrics import mean_squared_error
-
-
- def test_fit_predictpytee_correlation():
-     np.random.seed(42)
-     N = 100_000
-     F = 1000
-     X, y = make_regression(n_samples=N, n_features=F, noise=0.1, random_state=42)
-     era = np.zeros(N, dtype=np.int32)
-     corrs = []
-     mses = []
-
-     for hist_type in ["hist1", "hist2", "hist3"]:
-         print(f"\nTesting histogram method: {hist_type}")
-
-         model = WarpGBM(
-             max_depth=10,
-             num_bins=10,
-             n_estimators=100,
-             learning_rate=1,
-             verbosity=False,
-             histogram_computer=hist_type,
-             threads_per_block=64,
-             rows_per_thread=4,
-         )
-
-         start_fit = time.time()
-         model.fit(
-             X,
-             y,
-             era_id=era,
-             X_eval=X,
-             y_eval=y,
-             eval_every_n_trees=10,
-             early_stopping_rounds=1,
-         )
-         fit_time = time.time() - start_fit
-         print(f" Fit time: {fit_time:.3f} seconds")
-
-         start_pred = time.time()
-         preds = model.predict(X)
-         pred_time = time.time() - start_pred
-         print(f" Predict time: {pred_time:.3f} seconds")
-
-         corr = np.corrcoef(preds, y)[0, 1]
-         mse = mean_squared_error(preds, y)
-         print(f" Correlation: {corr:.4f}")
-         print(f" MSE: {mse:.4f}")
-         corrs.append(corr)
-         mses.append(mse)
-
-     assert (np.array(corrs) > 0.9).all(), f"In-sample correlation too low: {corrs}"
-     assert (np.array(mses) < 2).all(), f"In-sample mse too high: {mses}"
warpgbm-0.1.25/version.txt (deleted)
@@ -1 +0,0 @@
- 0.1.25
warpgbm-0.1.25/warpgbm/cuda/histogram_kernel.cu (deleted)
@@ -1,250 +0,0 @@
- #include <cuda.h>
- #include <cuda_runtime.h>
- #include <torch/extension.h>
-
- #define F_TILE 128 // Number of features processed per block (tile)
-
- // Each block processes a tile of features (of size up to F_TILE) and a chunk of samples.
- __global__ void histogram_kernel_shared_sample(
-     const int8_t *__restrict__ bin_indices, // [N, F] bin indices
-     const float *__restrict__ gradients,    // [N] gradient values
-     float *__restrict__ grad_hist,          // [F * B] global gradient histogram (flattened)
-     float *__restrict__ hess_hist,          // [F * B] global hessian histogram (flattened)
-     int64_t N, int64_t F, int64_t B)
- {
-     // Use dynamic shared memory to hold the histogram for a tile.
-     // Allocate 2 arrays: one for gradients and one for hessians.
-     extern __shared__ float shmem[];
-     float *shared_grad = shmem;                // size: tile_features * B floats
-     float *shared_hess = shmem + (F_TILE * B); // same size
-
-     int tid = threadIdx.x; // Use a 1D block (for sample processing)
-     int block_size = blockDim.x;
-
-     // Each block is assigned a tile of features:
-     int feature_offset = blockIdx.x * F_TILE;
-     // Adjust tile width if we're near the end of the feature dimension.
-     int tile_features = (feature_offset + F_TILE > F) ? (F - feature_offset) : F_TILE;
-     int tile_size = tile_features * B; // total number of bins in this feature tile
-
-     // Initialize the tile’s shared memory histograms.
-     for (int i = tid; i < tile_size; i += block_size)
-     {
-         shared_grad[i] = 0.0f;
-         shared_hess[i] = 0.0f;
-     }
-     __syncthreads();
-
-     // Each block also covers a chunk of samples. Determine the sample index
-     int sample = blockIdx.y * block_size + tid;
-     if (sample < N)
-     {
-         // For each feature in this tile, compute the bin and update shared histograms.
-         for (int j = 0; j < tile_features; j++)
-         {
-             // Global feature index.
-             int f_idx = feature_offset + j;
-             int64_t idx = sample * F + f_idx; // index into the [N, F] bin_indices tensor
-             int8_t b = bin_indices[idx];      // get bin index
-             if (b >= 0 && b < B)
-             {
-                 int shared_idx = j * B + b; // index into the tile histogram in shared memory
-                 // Using atomics because several threads may update the same bin.
-                 atomicAdd(&shared_grad[shared_idx], gradients[sample]);
-                 atomicAdd(&shared_hess[shared_idx], 1.0f);
-             }
-         }
-     }
-     __syncthreads();
-
-     // Flush the per-tile histograms from shared memory to global memory.
-     // Each bin in the tile is added to the global histogram (which is sized [F, B]).
-     for (int i = tid; i < tile_size; i += block_size)
-     {
-         int local_feature = i / B; // feature index relative to the tile
-         int bin = i % B;           // bin index
-         int f_idx = feature_offset + local_feature;
-         if (f_idx < F)
-         {
-             int global_idx = f_idx * B + bin;
-             atomicAdd(&grad_hist[global_idx], shared_grad[i]);
-             atomicAdd(&hess_hist[global_idx], shared_hess[i]);
-         }
-     }
- }
-
- void launch_histogram_kernel_cuda(
-     const at::Tensor &bin_indices, // [N, F] int8 tensor
-     const at::Tensor &gradients,   // [N] float tensor
-     at::Tensor &grad_hist,         // [F * B] float tensor (preallocated)
-     at::Tensor &hess_hist,         // [F * B] float tensor (preallocated)
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1)
- {
-     int64_t N = bin_indices.size(0);
-     int64_t F = bin_indices.size(1);
-     int64_t B = num_bins;
-
-     // Define grid and block dimensions.
-     // blockDim.x: number of threads per block (for processing samples).
-     // gridDim.x: number of feature tiles.
-     int grid_x = (F + F_TILE - 1) / F_TILE;
-     // gridDim.y: number of sample chunks.
-     int grid_y = (N + threads_per_block - 1) / threads_per_block;
-     dim3 blocks(grid_x, grid_y);
-     dim3 threads(threads_per_block);
-
-     // Calculate shared memory size:
-     // We allocate 2 arrays of size (F_TILE * B) floats (one for grad and one for hess).
-     size_t shared_mem_size = 2 * F_TILE * B * sizeof(float);
-
-     histogram_kernel_shared_sample<<<blocks, threads, shared_mem_size>>>(
-         bin_indices.data_ptr<int8_t>(),
-         gradients.data_ptr<float>(),
-         grad_hist.data_ptr<float>(),
-         hess_hist.data_ptr<float>(),
-         N, F, B);
- }
-
- // CUDA kernel: tiled, 64-bit safe
- __global__ void histogram_tiled_kernel(
-     const int8_t *__restrict__ bin_indices, // [N, F]
-     const float *__restrict__ gradients,    // [N]
-     float *__restrict__ grad_hist,          // [F * B]
-     float *__restrict__ hess_hist,          // [F * B]
-     int64_t F, int64_t B, int64_t tile_size)
- {
-     int64_t feature_tiles = (F + tile_size - 1) / tile_size;
-     int64_t row = static_cast<int64_t>(blockIdx.x) / feature_tiles;
-     int64_t tile = static_cast<int64_t>(blockIdx.x) % feature_tiles;
-     int64_t feat = tile * tile_size + threadIdx.x;
-
-     if (feat >= F)
-         return;
-
-     int8_t bin = bin_indices[row * F + feat];
-     if (bin >= 0 && bin < B)
-     {
-         int64_t idx = feat * B + bin;
-         atomicAdd(&grad_hist[idx], gradients[row]);
-         atomicAdd(&hess_hist[idx], 1.0f);
-     }
- }
-
- // Host function exposed to PyTorch
- void launch_histogram_kernel_cuda_2(
-     const at::Tensor &bin_indices, // int8 [N, F]
-     const at::Tensor &gradients,   // float32 [N]
-     at::Tensor &grad_hist,         // float32 [F * B]
-     at::Tensor &hess_hist,         // float32 [F * B]
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1)
- {
-
-     int64_t N = bin_indices.size(0);
-     int64_t F = bin_indices.size(1);
-     int64_t tile_size = threads_per_block;
-     int64_t feature_tiles = (F + tile_size - 1) / tile_size;
-     int64_t total_blocks = N * feature_tiles;
-
-     histogram_tiled_kernel<<<
-         static_cast<int>(total_blocks),
-         static_cast<int>(tile_size)>>>(
-         bin_indices.data_ptr<int8_t>(),
-         gradients.data_ptr<float>(),
-         grad_hist.data_ptr<float>(),
-         hess_hist.data_ptr<float>(),
-         F, num_bins, tile_size);
-
-     // Optional: check for kernel launch failure
-     cudaError_t err = cudaGetLastError();
-     if (err != cudaSuccess)
-     {
-         printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-     }
- }
-
- __global__ void histogram_tiled_configurable_kernel(
-     const int8_t *__restrict__ bin_indices, // [N, F]
-     const float *__restrict__ gradients,    // [N]
-     float *__restrict__ grad_hist,          // [F * B]
-     float *__restrict__ hess_hist,          // [F * B]
-     int64_t N, int64_t F, int64_t B,
-     int rows_per_thread)
- {
-     int feat = blockIdx.x; // 1 block per feature
-     int row_start = (blockIdx.y * blockDim.x + threadIdx.x) * rows_per_thread;
-
-     extern __shared__ float shmem[];
-     float *sh_grad = shmem;       // [B]
-     float *sh_hess = &sh_grad[B]; // [B]
-
-     // Initialize shared memory histograms
-     for (int b = threadIdx.x; b < B; b += blockDim.x)
-     {
-         sh_grad[b] = 0.0f;
-         sh_hess[b] = 0.0f;
-     }
-     __syncthreads();
-
-     // Each thread processes multiple rows
-     for (int r = 0; r < rows_per_thread; ++r)
-     {
-         int row = row_start + r;
-         if (row < N)
-         {
-             int8_t bin = bin_indices[row * F + feat];
-             if (bin >= 0 && bin < B)
-             {
-                 atomicAdd(&sh_grad[bin], gradients[row]);
-                 atomicAdd(&sh_hess[bin], 1.0f);
-             }
-         }
-     }
-     __syncthreads();
-
-     // One thread per bin writes results back to global memory
-     for (int b = threadIdx.x; b < B; b += blockDim.x)
-     {
-         int64_t idx = feat * B + b;
-         atomicAdd(&grad_hist[idx], sh_grad[b]);
-         atomicAdd(&hess_hist[idx], sh_hess[b]);
-     }
- }
-
- void launch_histogram_kernel_cuda_configurable(
-     const at::Tensor &bin_indices,
-     const at::Tensor &gradients,
-     at::Tensor &grad_hist,
-     at::Tensor &hess_hist,
-     int num_bins,
-     int threads_per_block = 256,
-     int rows_per_thread = 1)
- {
-
-     int64_t N = bin_indices.size(0);
-     int64_t F = bin_indices.size(1);
-
-     int rows_per_block = threads_per_block * rows_per_thread;
-     int row_tiles = (N + rows_per_block - 1) / rows_per_block;
-
-     dim3 blocks(F, row_tiles); // grid.x = F, grid.y = row_tiles
-     dim3 threads(threads_per_block);
-     int shared_mem_bytes = 2 * num_bins * sizeof(float);
-
-     histogram_tiled_configurable_kernel<<<blocks, threads, shared_mem_bytes>>>(
-         bin_indices.data_ptr<int8_t>(),
-         gradients.data_ptr<float>(),
-         grad_hist.data_ptr<float>(),
-         hess_hist.data_ptr<float>(),
-         N, F, num_bins,
-         rows_per_thread);
-
-     cudaError_t err = cudaGetLastError();
-     if (err != cudaSuccess)
-     {
-         printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
-     }
- }