autogluon.tabular 1.3.2b20250610__py3-none-any.whl → 1.4.1b20251214__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autogluon/tabular/configs/config_helper.py +1 -1
- autogluon/tabular/configs/hyperparameter_configs.py +2 -265
- autogluon/tabular/configs/pipeline_presets.py +130 -0
- autogluon/tabular/configs/presets_configs.py +51 -26
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +0 -1
- autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +310 -0
- autogluon/tabular/models/__init__.py +6 -1
- autogluon/tabular/models/_utils/rapids_utils.py +1 -1
- autogluon/tabular/models/automm/automm_model.py +2 -0
- autogluon/tabular/models/automm/ft_transformer.py +4 -1
- autogluon/tabular/models/catboost/callbacks.py +3 -2
- autogluon/tabular/models/catboost/catboost_model.py +15 -9
- autogluon/tabular/models/catboost/catboost_utils.py +17 -3
- autogluon/tabular/models/ebm/__init__.py +0 -0
- autogluon/tabular/models/ebm/ebm_model.py +259 -0
- autogluon/tabular/models/ebm/hyperparameters/__init__.py +0 -0
- autogluon/tabular/models/ebm/hyperparameters/parameters.py +39 -0
- autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +72 -0
- autogluon/tabular/models/fastainn/tabular_nn_fastai.py +7 -5
- autogluon/tabular/models/knn/knn_model.py +7 -3
- autogluon/tabular/models/lgb/lgb_model.py +60 -21
- autogluon/tabular/models/lr/lr_model.py +6 -1
- autogluon/tabular/models/lr/lr_preprocessing_utils.py +6 -7
- autogluon/tabular/models/lr/lr_rapids_model.py +45 -5
- autogluon/tabular/models/mitra/__init__.py +0 -0
- autogluon/tabular/models/mitra/_internal/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +190 -0
- autogluon/tabular/models/mitra/_internal/config/config_run.py +32 -0
- autogluon/tabular/models/mitra/_internal/config/enums.py +162 -0
- autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/core/callbacks.py +94 -0
- autogluon/tabular/models/mitra/_internal/core/get_loss.py +54 -0
- autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +108 -0
- autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +67 -0
- autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +132 -0
- autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +373 -0
- autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/data/collator.py +46 -0
- autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +136 -0
- autogluon/tabular/models/mitra/_internal/data/dataset_split.py +57 -0
- autogluon/tabular/models/mitra/_internal/data/preprocessor.py +420 -0
- autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/models/base.py +21 -0
- autogluon/tabular/models/mitra/_internal/models/embedding.py +182 -0
- autogluon/tabular/models/mitra/_internal/models/tab2d.py +667 -0
- autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -0
- autogluon/tabular/models/mitra/_internal/utils/set_seed.py +15 -0
- autogluon/tabular/models/mitra/mitra_model.py +380 -0
- autogluon/tabular/models/mitra/sklearn_interface.py +494 -0
- autogluon/tabular/models/realmlp/__init__.py +0 -0
- autogluon/tabular/models/realmlp/realmlp_model.py +360 -0
- autogluon/tabular/models/rf/rf_model.py +11 -6
- autogluon/tabular/models/tabicl/__init__.py +0 -0
- autogluon/tabular/models/tabicl/tabicl_model.py +179 -0
- autogluon/tabular/models/tabm/__init__.py +0 -0
- autogluon/tabular/models/tabm/_tabm_internal.py +545 -0
- autogluon/tabular/models/tabm/rtdl_num_embeddings.py +810 -0
- autogluon/tabular/models/tabm/tabm_model.py +356 -0
- autogluon/tabular/models/tabm/tabm_reference.py +631 -0
- autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +13 -7
- autogluon/tabular/models/tabpfnv2/__init__.py +0 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +20 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +40 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +201 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +1464 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +747 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +863 -0
- autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +106 -0
- autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +388 -0
- autogluon/tabular/models/tabular_nn/hyperparameters/parameters.py +1 -3
- autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +5 -5
- autogluon/tabular/models/xgboost/xgboost_model.py +10 -3
- autogluon/tabular/predictor/predictor.py +147 -84
- autogluon/tabular/registry/_ag_model_registry.py +12 -2
- autogluon/tabular/testing/fit_helper.py +57 -27
- autogluon/tabular/testing/generate_datasets.py +7 -0
- autogluon/tabular/trainer/abstract_trainer.py +3 -1
- autogluon/tabular/trainer/model_presets/presets.py +10 -1
- autogluon/tabular/version.py +1 -1
- autogluon.tabular-1.4.1b20251214-py3.11-nspkg.pth +1 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/METADATA +112 -57
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/RECORD +89 -40
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/WHEEL +1 -1
- autogluon/tabular/models/tabpfn/__init__.py +0 -1
- autogluon/tabular/models/tabpfn/tabpfn_model.py +0 -153
- autogluon.tabular-1.3.2b20250610-py3.9-nspkg.pth +0 -1
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/LICENSE +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info/licenses}/NOTICE +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/namespace_packages.txt +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/top_level.txt +0 -0
- {autogluon.tabular-1.3.2b20250610.dist-info → autogluon_tabular-1.4.1b20251214.dist-info}/zip-safe +0 -0
autogluon/tabular/models/ebm/ebm_model.py (new file)

@@ -0,0 +1,259 @@
+from __future__ import annotations
+
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+from autogluon.core.models import AbstractModel
+
+from .hyperparameters.parameters import get_param_baseline
+from .hyperparameters.searchspaces import get_default_searchspace
+
+if TYPE_CHECKING:
+    from autogluon.core.metrics import Scorer
+
+
+class EbmCallback:
+    """Time limit callback for EBM."""
+
+    def __init__(self, seconds: float):
+        self.seconds = seconds
+        self.end_time: float | None = None
+
+    def __call__(self, *args, **kwargs):
+        if self.end_time is None:
+            self.end_time = time.monotonic() + self.seconds
+            return False
+        return time.monotonic() > self.end_time
+
+
+class EBMModel(AbstractModel):
+    """
+    The Explainable Boosting Machine (EBM) is a glass-box generalized additive model
+    with automatic interaction detection (https://interpret.ml/docs). EBMs are
+    designed to be highly interpretable while achieving accuracy comparable to
+    black-box models on a wide range of tabular datasets.
+
+    Requires the 'interpret' or 'interpret-core' package. Install via:
+
+        pip install interpret
+
+
+    Paper: InterpretML: A Unified Framework for Machine Learning Interpretability
+
+    Authors: H. Nori, S. Jenkins, P. Koch, and R. Caruana 2019
+
+    Codebase: https://github.com/interpretml/interpret
+
+    License: MIT
+
+    .. versionadded:: 1.5.0
+    """
+
+    ag_key = "EBM"
+    ag_name = "EBM"
+    ag_priority = 35
+    seed_name = "random_state"
+
+    def _fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        X_val: pd.DataFrame | None = None,
+        y_val: pd.Series | None = None,
+        time_limit: float | None = None,
+        sample_weight: np.ndarray | None = None,
+        sample_weight_val: np.ndarray | None = None,
+        num_cpus: int | str = "auto",
+        **kwargs,
+    ):
+        # Preprocess data.
+        X = self.preprocess(X)
+        if X_val is not None:
+            X_val = self.preprocess(X_val)
+
+        features = self._features
+        if features is None:
+            features = X.columns
+
+        params = construct_ebm_params(
+            self.problem_type,
+            self._get_model_params(),
+            features,
+            self.stopping_metric,
+            num_cpus,
+            time_limit,
+        )
+
+        # Init Class
+        model_cls = get_class_from_problem_type(self.problem_type)
+        self.model = model_cls(**params)
+
+        # Handle validation data format for EBM
+        fit_X = X
+        fit_y = y
+        fit_sample_weight = sample_weight
+        bags = None
+        if X_val is not None:
+            fit_X = pd.concat([X, X_val], ignore_index=True)
+            fit_y = pd.concat([y, y_val], ignore_index=True)
+            if sample_weight is not None:
+                fit_sample_weight = np.hstack([sample_weight, sample_weight_val])
+            bags = np.full((len(fit_X), 1), 1, np.int8)
+            bags[len(X) :, 0] = -1
+
+        with warnings.catch_warnings():  # try to filter joblib warnings
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                message=".*resource_tracker: process died.*",
+            )
+            self.model.fit(fit_X, fit_y, sample_weight=fit_sample_weight, bags=bags)
+
+    def _set_default_params(self):
+        default_params = get_param_baseline(problem_type=self.problem_type, num_classes=self.num_classes)
+        for param, val in default_params.items():
+            self._set_default_param_value(param, val)
+
+    def _get_default_searchspace(self):
+        return get_default_searchspace(problem_type=self.problem_type, num_classes=self.num_classes)
+
+    def _get_default_auxiliary_params(self) -> dict:
+        default_auxiliary_params = super()._get_default_auxiliary_params()
+        extra_auxiliary_params = {
+            "valid_raw_types": ["int", "float", "category"],
+        }
+        default_auxiliary_params.update(extra_auxiliary_params)
+        return default_auxiliary_params
+
+    @classmethod
+    def supported_problem_types(cls) -> list[str] | None:
+        return ["binary", "multiclass", "regression"]
+
+    @classmethod
+    def _class_tags(cls) -> dict:
+        return {"can_estimate_memory_usage_static": True}
+
+    def _more_tags(self) -> dict:
+        """EBMs support refit full."""
+        return {"can_refit_full": True}
+
+    def _estimate_memory_usage(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> int:
+        return self.estimate_memory_usage_static(
+            X=X,
+            y=y,
+            hyperparameters=self._get_model_params(),
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            features=self._features,
+            **kwargs,
+        )
+
+    @classmethod
+    def _estimate_memory_usage_static(
+        cls,
+        *,
+        X: pd.DataFrame,
+        y: pd.Series | None = None,
+        hyperparameters: dict | None = None,
+        problem_type: str = "infer",
+        num_classes: int = 1,
+        features=None,
+        **kwargs,
+    ) -> int:
+        """Returns the expected peak memory usage in bytes of the EBM model during fit."""
+        # TODO: we can improve the memory estimate slightly by using num_classes if y is None
+
+        if features is None:
+            features = X.columns
+
+        model_cls = get_class_from_problem_type(problem_type)
+        params = construct_ebm_params(problem_type, hyperparameters, features)
+        baseline_memory_bytes = 400_000_000  # 400 MB baseline memory
+
+        # assuming we call pd.concat([X, X_val], ignore_index=True), then X size will be doubled
+        return baseline_memory_bytes + model_cls(**params).estimate_mem(
+            X, y, data_multiplier=2.0
+        )
+
+    def _validate_fit_memory_usage(self, mem_error_threshold: float = 1, **kwargs):
+        # Given the good mem estimates with overhead, we set the threshold to 1.
+        return super()._validate_fit_memory_usage(
+            mem_error_threshold=mem_error_threshold, **kwargs
+        )
+
+
+def construct_ebm_params(
+    problem_type,
+    hyperparameters=None,
+    features=None,
+    stopping_metric=None,
+    num_cpus=-1,
+    time_limit=None,
+):
+    if hyperparameters is None:
+        hyperparameters = {}
+
+    hyperparameters = hyperparameters.copy()  # we pop values below, so copy.
+
+    # The user can specify nominal and continuous columns.
+    continuous_columns = hyperparameters.pop("continuous_columns", [])
+    nominal_columns = hyperparameters.pop("nominal_columns", [])
+
+    feature_types = None
+    if features is not None:
+        feature_types = []
+        for c in features:
+            if c in continuous_columns:
+                f_type = "continuous"
+            elif c in nominal_columns:
+                f_type = "nominal"
+            else:
+                f_type = "auto"
+            feature_types.append(f_type)
+
+    # Default parameters for EBM
+    params = {
+        "outer_bags": 1,  # AutoGluon ensemble creates outer bags, no need for this overhead.
+        "n_jobs": 1,  # EBM only parallelizes across outer bags currently, so ignore num_cpus
+        "feature_names": features,
+        "feature_types": feature_types,
+    }
+    if stopping_metric is not None:
+        params["objective"] = get_metric_from_ag_metric(
+            metric=stopping_metric, problem_type=problem_type
+        )
+    if time_limit is not None:
+        params["callback"] = EbmCallback(time_limit)
+
+    params.update(hyperparameters)
+    return params
+
+
+def get_class_from_problem_type(problem_type: str):
+    if problem_type in [BINARY, MULTICLASS]:
+        from interpret.glassbox import ExplainableBoostingClassifier
+
+        model_cls = ExplainableBoostingClassifier
+    elif problem_type == REGRESSION:
+        from interpret.glassbox import ExplainableBoostingRegressor
+
+        model_cls = ExplainableBoostingRegressor
+    else:
+        raise ValueError(f"Unsupported problem type: {problem_type}")
+    return model_cls
+
+
+def get_metric_from_ag_metric(*, metric: Scorer, problem_type: str):
+    """Map AutoGluon metric to EBM metric for early stopping."""
+    if problem_type in [BINARY, MULTICLASS]:
+        metric_class = "log_loss"
+    elif problem_type == REGRESSION:
+        metric_class = "rmse"
+    else:
+        raise AssertionError(f"EBM does not support {problem_type} problem type.")
+
+    return metric_class
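Usage note: the model above registers with ag_key = "EBM" and needs the `interpret` package. A minimal sketch of how it could be requested, assuming the registered "EBM" key is accepted by TabularPredictor's hyperparameters argument (the file path, label column, and feature names below are hypothetical):

    # Sketch only: assumes "EBM" is accepted as a model key and that
    # `pip install interpret` has been run; "train.csv", "class", "city",
    # and "income" are made-up names.
    from autogluon.tabular import TabularDataset, TabularPredictor

    train_data = TabularDataset("train.csv")
    predictor = TabularPredictor(label="class").fit(
        train_data,
        hyperparameters={
            "EBM": {
                "nominal_columns": ["city"],       # consumed by construct_ebm_params
                "continuous_columns": ["income"],  # consumed by construct_ebm_params
            }
        },
    )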
File without changes
autogluon/tabular/models/ebm/hyperparameters/parameters.py (new file)

@@ -0,0 +1,39 @@
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION, SOFTCLASS
+
+def get_param_baseline(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_param_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == SOFTCLASS:
+        return get_param_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_param_regression_baseline()
+    else:
+        return get_param_binary_baseline()
+
+
+def get_base_params():
+    base_params = {}
+    return base_params
+
+
+def get_param_binary_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_multiclass_baseline(num_classes):
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_param_regression_baseline():
+    params = get_base_params()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
autogluon/tabular/models/ebm/hyperparameters/searchspaces.py (new file)

@@ -0,0 +1,72 @@
+"""Default hyperparameter search spaces used in EBM model"""
+
+from autogluon.common import space
+from autogluon.core.constants import BINARY, MULTICLASS, REGRESSION
+
+def get_default_searchspace(problem_type, num_classes=None):
+    if problem_type == BINARY:
+        return get_searchspace_binary_baseline()
+    elif problem_type == MULTICLASS:
+        return get_searchspace_multiclass_baseline(num_classes=num_classes)
+    elif problem_type == REGRESSION:
+        return get_searchspace_regression_baseline()
+    else:
+        return get_searchspace_binary_baseline()
+
+
+def get_base_searchspace():
+    base_params = {
+        "max_leaves": space.Int(2, 3, default=2),
+        "smoothing_rounds": space.Int(0, 1000, default=200),
+        "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
+        "interactions": space.Categorical(
+            0,
+            "0.5x",
+            "1x",
+            "1.5x",
+            "2x",
+            "2.5x",
+            "3x",
+            "3.5x",
+            "4x",
+            "4.5x",
+            "5x",
+            "6x",
+            "7x",
+            "8x",
+            "9x",
+            "10x",
+            "15x",
+            "20x",
+            "25x",
+        ),
+        "interaction_smoothing_rounds": space.Int(0, 200, default=90),
+        "min_hessian": space.Real(1e-10, 1e-2, default=1e-4, log=True),
+        "min_samples_leaf": space.Int(2, 20, default=4),
+        "gain_scale": space.Real(0.5, 5.0, default=5.0, log=True),
+        "min_cat_samples": space.Int(5, 20, default=10),
+        "cat_smooth": space.Real(5.0, 100.0, default=10.0, log=True),
+        "missing": space.Categorical("separate", "low", "high", "gain"),
+    }
+    return base_params
+
+
+def get_searchspace_multiclass_baseline(num_classes):
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_binary_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
+
+
+def get_searchspace_regression_baseline():
+    params = get_base_searchspace()
+    baseline_params = {}
+    params.update(baseline_params)
+    return params
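Because the entries above are autogluon.common.space objects, the same objects can be passed by a caller to override individual entries. A short sketch, assuming the usual TabularPredictor HPO path and that the EBM model is addressed by its "EBM" key (both assumptions; the data file and label column are hypothetical):

    # Sketch only: overrides two of the search-space entries defined above.
    from autogluon.common import space
    from autogluon.tabular import TabularDataset, TabularPredictor

    train_data = TabularDataset("train.csv")  # hypothetical file
    predictor = TabularPredictor(label="class").fit(
        train_data,
        hyperparameters={
            "EBM": {
                "learning_rate": space.Real(0.0025, 0.2, default=0.02, log=True),
                "max_leaves": space.Int(2, 3, default=2),
            }
        },
        hyperparameter_tune_kwargs="auto",  # enable HPO so the spaces are actually searched
    )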
autogluon/tabular/models/fastainn/tabular_nn_fastai.py

@@ -103,6 +103,7 @@ class NNFastAiTabularModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         MULTICLASS: 95,
     })
+    seed_name = "random_seed"
 
     model_internals_file_name = "model-internals.pkl"
 
@@ -322,8 +323,9 @@ class NNFastAiTabularModel(AbstractModel):
         # Make deterministic
         from fastai.torch_core import set_seed
 
-
-
+        random_seed = params.pop(self.seed_name, self.default_random_seed)
+        set_seed(random_seed, True)
+        dls.rng.seed(random_seed)
 
         if self.problem_type == QUANTILE:
             dls.c = len(self.quantile_levels)
@@ -584,8 +586,8 @@ class NNFastAiTabularModel(AbstractModel):
         return default_auxiliary_params
 
     def _get_default_resources(self):
-        #
-        num_cpus = ResourceManager.
+        # only_physical_cores=True is faster in training
+        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
         num_gpus = 0
         return num_cpus, num_gpus
 
@@ -642,7 +644,7 @@ class NNFastAiTabularModel(AbstractModel):
 
     def _get_maximum_resources(self) -> dict[str, Union[int, float]]:
         # fastai model trains slower when utilizing virtual cores and this issue scale up when the number of cpu cores increases
-        return {"num_cpus": ResourceManager.
+        return {"num_cpus": ResourceManager.get_cpu_count(only_physical_cores=True)}
 
     def get_minimum_resources(self, is_gpu_available=False):
         minimum_resources = {
autogluon/tabular/models/knn/knn_model.py

@@ -214,7 +214,7 @@ class KNNModel(AbstractModel):
         def sample_func(chunk, frac):
             # Guarantee at least 1 sample (otherwise log_loss would crash or model would return different column counts in pred_proba)
             n = max(math.ceil(len(chunk) * frac), 1)
-            return chunk.sample(n=n, replace=False, random_state=
+            return chunk.sample(n=n, replace=False, random_state=self.random_seed)
 
         if self.problem_type != REGRESSION:
             y_df = y.to_frame(name="label").reset_index(drop=True)
@@ -255,9 +255,13 @@ class KNNModel(AbstractModel):
         self._X_unused_index = [i for i in range(num_rows_max) if i not in idx]
         return self.model
 
-    def _get_maximum_resources(self) ->
+    def _get_maximum_resources(self) -> dict[str, int | float]:
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
-
+        # no GPU support
+        return {
+            "num_cpus": 32,
+            "num_gpus": 0,
+        }
 
     def _get_default_resources(self):
         # use at most 32 cpus to avoid OpenBLAS error: https://github.com/autogluon/autogluon/issues/1020
autogluon/tabular/models/lgb/lgb_model.py

@@ -46,6 +46,8 @@ class LGBModel(AbstractModel):
     ag_priority_by_problem_type = MappingProxyType({
         SOFTCLASS: 100
     })
+    seed_name = "seed"
+    seed_name_alt = ["seed_value", "random_seed", "random_state"]
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -161,7 +163,7 @@ class LGBModel(AbstractModel):
             # Before enabling GPU, we should add code to detect that GPU-enabled version is installed and that a valid GPU exists.
             # GPU training heavily alters accuracy, often in a negative manner. We will have to be careful about when to use GPU.
             params["device"] = "gpu"
-            logger.log(20, f"\
+            logger.log(20, f"\tWarning: Training LightGBM with GPU. This may negatively impact model quality compared to CPU training.")
         logger.log(15, f"\tFitting {num_boost_round} rounds... Hyperparameters: {params}")
 
         if "num_threads" not in params:
@@ -225,7 +227,6 @@ class LGBModel(AbstractModel):
         if log_period is not None:
             callbacks.append(log_evaluation(period=log_period))
 
-        seed_val = params.pop("seed_value", 0)
         train_params = {
             "params": params,
             "train_set": dataset_train,
@@ -281,11 +282,10 @@ class LGBModel(AbstractModel):
             train_params["params"]["metric"] = f'{stopping_metric},{train_params["params"]["metric"]}'
 
         if self.problem_type == SOFTCLASS:
-            train_params["
+            train_params["params"]["objective"] = lgb_utils.softclass_lgbobj
+            train_params["params"]["num_classes"] = self.num_classes
         elif self.problem_type == QUANTILE:
            train_params["params"]["quantile_levels"] = self.quantile_levels
-        if seed_val is not None:
-            train_params["params"]["seed"] = seed_val
 
         # Train LightGBM model:
         # Note that self.model contains a <class 'lightgbm.basic.Booster'> not a LightBGMClassifier or LightGBMRegressor object
@@ -298,16 +298,28 @@ class LGBModel(AbstractModel):
         try:
             self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
         except LightGBMError:
-            if train_params["params"].get("device", "cpu")
+            if train_params["params"].get("device", "cpu") not in ["gpu", "cuda"]:
                 raise
             else:
-
-
-
-
-
-
-
+                if train_params["params"]["device"] == "gpu":
+                    logger.warning(
+                        "Warning: GPU mode might not be installed for LightGBM, "
+                        "GPU training raised an exception. Falling back to CPU training..."
+                        "Refer to LightGBM GPU documentation: "
+                        "https://github.com/Microsoft/LightGBM/tree/master/python-package#build-gpu-version"
+                        "One possible method is:"
+                        "\tpip uninstall lightgbm -y"
+                        "\tpip install lightgbm --install-option=--gpu"
+                    )
+                elif train_params["params"]["device"] == "cuda":
+                    # Current blocker for using CUDA over GPU: https://github.com/microsoft/LightGBM/issues/6828
+                    # Note that device="cuda" works if AutoGluon (and therefore LightGBM) is installed via conda.
+                    logger.warning(
+                        "Warning: CUDA mode might not be installed for LightGBM, "
+                        "CUDA training raised an exception. Falling back to CPU training..."
+                        "Refer to LightGBM CUDA documentation: "
+                        "https://github.com/Microsoft/LightGBM/tree/master/python-package#build-cuda-version"
+                    )
                 train_params["params"]["device"] = "cpu"
                 self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
         retrain = False
@@ -508,17 +520,44 @@ class LGBModel(AbstractModel):
         default_auxiliary_params.update(extra_auxiliary_params)
         return default_auxiliary_params
 
-
+    @staticmethod
+    def _is_gpu_lgbm_installed():
        # Taken from https://github.com/microsoft/LightGBM/issues/3939
        try_import_lightgbm()
        import lightgbm
 
+        rng = np.random.RandomState(42)
+        data = rng.rand(25, 2)
+        label = rng.randint(2, size=25)
+
+        try:
+            train_data = lightgbm.Dataset(data, label=label)
+            params = {
+                "device": "gpu",
+                "verbose": -1,
+            }
+            gbm = lightgbm.train(params, num_boost_round=10, train_set=train_data)
+            return True
+        except Exception as e:
+            return False
+
+    @staticmethod
+    def _is_cuda_lgbm_installed():
+        # Taken from https://github.com/microsoft/LightGBM/issues/3939
+        try_import_lightgbm()
+        import lightgbm
+
+        rng = np.random.RandomState(42)
+        data = rng.rand(25, 2)
+        label = rng.randint(2, size=25)
+
         try:
-            data = np.random.rand(50, 2)
-            label = np.random.randint(2, size=50)
             train_data = lightgbm.Dataset(data, label=label)
-            params = {
-
+            params = {
+                "device": "cuda",
+                "verbose": -1,
+            }
+            gbm = lightgbm.train(params, num_boost_round=10, train_set=train_data)
             return True
         except Exception as e:
             return False
@@ -527,13 +566,13 @@ class LGBModel(AbstractModel):
         minimum_resources = {
             "num_cpus": 1,
         }
-        if is_gpu_available
+        if is_gpu_available:
             minimum_resources["num_gpus"] = 0.5
         return minimum_resources
 
     def _get_default_resources(self):
-        #
-        num_cpus = ResourceManager.
+        # only_physical_cores=True is faster in training
+        num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
         num_gpus = 0
         return num_cpus, num_gpus
 
autogluon/tabular/models/lr/lr_model.py

@@ -43,6 +43,7 @@ class LinearModel(AbstractModel):
     ag_key = "LR"
     ag_name = "LinearModel"
     ag_priority = 30
+    seed_name = "random_state"
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -155,7 +156,7 @@ class LinearModel(AbstractModel):
         return self._pipeline.fit_transform(X)
 
     def _set_default_params(self):
-        default_params = {"
+        default_params = {"fit_intercept": True}
         if self.problem_type != REGRESSION:
             default_params.update({"solver": _get_solver(self.problem_type)})
         default_params.update(get_param_baseline())
@@ -319,6 +320,10 @@ class LinearModel(AbstractModel):
     ) -> int:
         return 4 * get_approximate_df_mem_usage(X).sum()
 
+    def _get_maximum_resources(self) -> dict[str, int | float]:
+        # no GPU support
+        return {"num_gpus": 0}
+
     @classmethod
     def supported_problem_types(cls) -> list[str] | None:
         return ["binary", "multiclass", "regression"]
autogluon/tabular/models/lr/lr_preprocessing_utils.py

@@ -5,20 +5,19 @@ from autogluon.features.generators import OneHotEncoderFeatureGenerator
 
 class OheFeaturesGenerator(BaseEstimator, TransformerMixin):
     def __init__(self):
-
-        self._encoder = None
+        pass
 
     def fit(self, X, y=None):
-        self.
-        self.
-        self.
+        self.encoder_ = OneHotEncoderFeatureGenerator(max_levels=10000, verbosity=0)
+        self.encoder_.fit(X)
+        self.feature_names_ = self.encoder_.features_out
         return self
 
     def transform(self, X, y=None):
-        return self.
+        return self.encoder_.transform_ohe(X)
 
     def get_feature_names(self):
-        return self.
+        return self.feature_names_
 
 
 class NlpDataPreprocessor(BaseEstimator, TransformerMixin):
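The OheFeaturesGenerator change above appears to follow scikit-learn's estimator convention: __init__ holds no fitted state, and learned attributes (encoder_, feature_names_) get a trailing underscore and are created in fit. A small generic sketch of why that convention is useful (illustration only, not AutoGluon code): with state created only in fit, sklearn.base.clone can produce a fresh, unfitted copy of the transformer.

    # Generic sketch (not from the diff): trailing-underscore attributes set in
    # fit() keep clone() and repeated fitting well behaved.
    from sklearn.base import BaseEstimator, TransformerMixin, clone

    class PassthroughTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            self.n_features_in_ = X.shape[1]  # fitted attribute, created in fit only
            return self

        def transform(self, X, y=None):
            return X

    fresh_copy = clone(PassthroughTransformer())  # unfitted copy, no leaked state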